diff --git a/.yardopts b/.yardopts index bd2c740114..d6ce1abb52 100644 --- a/.yardopts +++ b/.yardopts @@ -1,5 +1,8 @@ --embed-mixins --main=README.md +--exclude=lib/nokogiri/css/tokenizer.rb +--exclude=lib/nokogiri/css/parser.rb +--exclude=ext/nokogiri/test_global_handlers.c lib/**/*.rb ext/nokogiri/*.c - diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bcf222930..e1426763ff 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,17 +6,25 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ## next / unreleased -### Added +### Notable Addition: HTML5 Support (CRuby only) __HTML5 support__ has been added (to CRuby only) by merging [Nokogumbo](https://github.com/rubys/nokogumbo) into Nokogiri. The Nokogumbo public API has been preserved, so this functionality is available under the `Nokogiri::HTML5` namespace. [[#2204](https://github.com/sparklemotion/nokogiri/issues/2204)] Please note that HTML5 support is not available for JRuby in this version. However, we feel it is important to think about JRuby and we hope to work on this in the future. If you're interested in helping with HTML5 support on JRuby, please reach out to the maintainers by commenting on issue [#2227](https://github.com/sparklemotion/nokogiri/issues/2227). -Please also note that the `Nokogiri::HTML` parse methods still use libxml2's HTML4 parser in the v1.12 release series. Future releases of Nokogiri may change this behavior, but we'll proceed cautiously to avoid breaking existing applications. - Many thanks to Sam Ruby, Steve Checkoway, and Craig Barnes for creating and maintaining Nokogumbo and supporting the Gumbo HTML5 parser. They're now Nokogiri core contributors with all the powers and privileges pertaining thereto. 🙌 -#### Other + +### Notable Change: `Nokogiri::HTML4` module and namespace + +`Nokogiri::HTML` has been renamed to `Nokogiri::HTML4`, and `Nokogiri::HTML` is aliased to preserve backwards-compatibility. 
`Nokogiri::HTML` and `Nokogiri::HTML4` parse methods still use libxml2's (or NekoHTML's) HTML4 parser in the v1.12 release series. + +Take special note that if you rely on the class name of an object in your code, objects will now report a class of `Nokogiri::HTML4::Foo` where they previously reported `Nokogiri::HTML::Foo`. Instead of relying on the string returned by `Object#class`, prefer `Class#===` or `Object#is_a?` or `Object#instance_of?`. + +Future releases of Nokogiri may deprecate `HTML` methods or otherwise change this behavior, so please start using `HTML4` in place of `HTML`. + + +### Added * [CRuby] `Nokogiri::VERSION_INFO["libxslt"]["datetime_enabled"]` is a new boolean value which describes whether libxslt (or, more properly, libexslt) has compiled-in datetime support. This is generally going to be `true`, but some distros ship without this support (e.g., some mingw UCRT-based packages, see https://github.com/msys2/MINGW-packages/pull/8957). See [#2272](https://github.com/sparklemotion/nokogiri/issues/2272) for more details. @@ -38,6 +46,11 @@ Many thanks to Sam Ruby, Steve Checkoway, and Craig Barnes for creating and main * [CRuby] Speed up (slightly) the compile time of packaged libraries `libiconv`, `libxml2`, and `libxslt` by using autoconf's `--disable-dependency-tracking` option. ("ruby" platform gem only.) +### Deprecated + +* Deprecating Nokogumbo's `Nokogiri::HTML5.get`. This method will be removed in a future version of Nokogiri. + + ### Dependencies * [CRuby] Upgrade mini_portile2 dependency from `~> 2.5.0` to `~> 2.6.1`. ("ruby" platform gem only.) 
diff --git a/README.md b/README.md index 526d10f6ef..59312cd7a6 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ Some guiding principles Nokogiri tries to follow: ## Features Overview -- DOM Parser for XML and HTML4 +- DOM Parser for XML, HTML4, and HTML5 - SAX Parser for XML and HTML4 - Push Parser for XML and HTML4 - Document search via XPath 1.0 diff --git a/ext/java/nokogiri/HtmlDocument.java b/ext/java/nokogiri/Html4Document.java similarity index 83% rename from ext/java/nokogiri/HtmlDocument.java rename to ext/java/nokogiri/Html4Document.java index 7b95984793..bf1c660545 100644 --- a/ext/java/nokogiri/HtmlDocument.java +++ b/ext/java/nokogiri/Html4Document.java @@ -18,13 +18,13 @@ import static nokogiri.internals.NokogiriHelpers.getNokogiriClass; /** - * Class for Nokogiri::HTML::Document. + * Class for Nokogiri::HTML4::Document. * * @author sergio * @author Yoko Harada */ -@JRubyClass(name = "Nokogiri::HTML::Document", parent = "Nokogiri::XML::Document") -public class HtmlDocument extends XmlDocument +@JRubyClass(name = "Nokogiri::HTML4::Document", parent = "Nokogiri::XML::Document") +public class Html4Document extends XmlDocument { private static final String DEFAULT_CONTENT_TYPE = "html"; private static final String DEFAULT_PUBLIC_ID = "-//W3C//DTD HTML 4.01//EN"; @@ -33,19 +33,19 @@ public class HtmlDocument extends XmlDocument private String parsed_encoding = null; public - HtmlDocument(Ruby ruby, RubyClass klazz) + Html4Document(Ruby ruby, RubyClass klazz) { super(ruby, klazz); } public - HtmlDocument(Ruby runtime, Document document) + Html4Document(Ruby runtime, Document document) { this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document); } public - HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) + Html4Document(Ruby ruby, RubyClass klazz, Document doc) { super(ruby, klazz, doc); } @@ -55,10 +55,10 @@ public class HtmlDocument extends XmlDocument rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) { 
final Ruby runtime = context.runtime; - HtmlDocument htmlDocument; + Html4Document htmlDocument; try { Document docNode = createNewDocument(runtime); - htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(runtime, (RubyClass) klazz); + htmlDocument = (Html4Document) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(runtime, (RubyClass) klazz); htmlDocument.setDocumentNode(context.runtime, docNode); } catch (Exception ex) { throw asRuntimeError(runtime, "couldn't create document: ", ex); @@ -135,13 +135,6 @@ public class HtmlDocument extends XmlDocument return parsed_encoding; } - /* - * call-seq: - * read_io(io, url, encoding, options) - * - * Read the HTML document from +io+ with given +url+, +encoding+, - * and +options+. See Nokogiri::HTML.parse - */ @JRubyMethod(meta = true, required = 4) public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) @@ -151,13 +144,6 @@ public class HtmlDocument extends XmlDocument return ctx.parse(context, (RubyClass) klass, args[1]); } - /* - * call-seq: - * read_memory(string, url, encoding, options) - * - * Read the HTML document contained in +string+ with given +url+, +encoding+, - * and +options+. See Nokogiri::HTML.parse - */ @JRubyMethod(meta = true, required = 4) public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) diff --git a/ext/java/nokogiri/HtmlElementDescription.java b/ext/java/nokogiri/Html4ElementDescription.java similarity index 90% rename from ext/java/nokogiri/HtmlElementDescription.java rename to ext/java/nokogiri/Html4ElementDescription.java index b96b7ab6e1..88da6237ab 100644 --- a/ext/java/nokogiri/HtmlElementDescription.java +++ b/ext/java/nokogiri/Html4ElementDescription.java @@ -16,12 +16,12 @@ import org.jruby.runtime.builtin.IRubyObject; /** - * Class for Nokogiri::HTML::ElementDescription. + * Class for Nokogiri::HTML4::ElementDescription. 
* * @author Patrick Mahoney */ -@JRubyClass(name = "Nokogiri::HTML::ElementDescription") -public class HtmlElementDescription extends RubyObject +@JRubyClass(name = "Nokogiri::HTML4::ElementDescription") +public class Html4ElementDescription extends RubyObject { /** @@ -38,7 +38,7 @@ public class HtmlElementDescription extends RubyObject protected HTMLElements.Element element; public - HtmlElementDescription(Ruby runtime, RubyClass rubyClass) + Html4ElementDescription(Ruby runtime, RubyClass rubyClass) { super(runtime, rubyClass); } @@ -89,8 +89,8 @@ public class HtmlElementDescription extends RubyObject return context.nil; } - HtmlElementDescription desc = - new HtmlElementDescription(context.getRuntime(), (RubyClass)klazz); + Html4ElementDescription desc = + new Html4ElementDescription(context.getRuntime(), (RubyClass)klazz); desc.element = elem; return desc; } diff --git a/ext/java/nokogiri/HtmlEntityLookup.java b/ext/java/nokogiri/Html4EntityLookup.java similarity index 79% rename from ext/java/nokogiri/HtmlEntityLookup.java rename to ext/java/nokogiri/Html4EntityLookup.java index 2388cbf369..e33f5b2273 100644 --- a/ext/java/nokogiri/HtmlEntityLookup.java +++ b/ext/java/nokogiri/Html4EntityLookup.java @@ -12,16 +12,16 @@ import org.jruby.runtime.builtin.IRubyObject; /** - * Class for Nokogiri::HTML::EntityLookup. + * Class for Nokogiri::HTML4::EntityLookup. 
* * @author Patrick Mahoney */ -@JRubyClass(name = "Nokogiri::HTML::EntityLookup") -public class HtmlEntityLookup extends RubyObject +@JRubyClass(name = "Nokogiri::HTML4::EntityLookup") +public class Html4EntityLookup extends RubyObject { public - HtmlEntityLookup(Ruby runtime, RubyClass rubyClass) + Html4EntityLookup(Ruby runtime, RubyClass rubyClass) { super(runtime, rubyClass); } @@ -41,7 +41,7 @@ public class HtmlEntityLookup extends RubyObject if (val == -1) { return ruby.getNil(); } IRubyObject edClass = - ruby.getClassFromPath("Nokogiri::HTML::EntityDescription"); + ruby.getClassFromPath("Nokogiri::HTML4::EntityDescription"); IRubyObject edObj = invoke(context, edClass, "new", ruby.newFixnum(val), ruby.newString(name), ruby.newString(name + " entity")); diff --git a/ext/java/nokogiri/HtmlSaxParserContext.java b/ext/java/nokogiri/Html4SaxParserContext.java similarity index 88% rename from ext/java/nokogiri/HtmlSaxParserContext.java rename to ext/java/nokogiri/Html4SaxParserContext.java index 96896bdfe9..b03217ec6e 100644 --- a/ext/java/nokogiri/HtmlSaxParserContext.java +++ b/ext/java/nokogiri/Html4SaxParserContext.java @@ -24,27 +24,27 @@ import static nokogiri.internals.NokogiriHelpers.rubyStringToString; /** - * Class for Nokogiri::HTML::SAX::ParserContext. + * Class for Nokogiri::HTML4::SAX::ParserContext. 
* * @author serabe * @author Patrick Mahoney * @author Yoko Harada */ -@JRubyClass(name = "Nokogiri::HTML::SAX::ParserContext", parent = "Nokogiri::XML::SAX::ParserContext") -public class HtmlSaxParserContext extends XmlSaxParserContext +@JRubyClass(name = "Nokogiri::HTML4::SAX::ParserContext", parent = "Nokogiri::XML::SAX::ParserContext") +public class Html4SaxParserContext extends XmlSaxParserContext { - static HtmlSaxParserContext + static Html4SaxParserContext newInstance(final Ruby runtime, final RubyClass klazz) { - HtmlSaxParserContext instance = new HtmlSaxParserContext(runtime, klazz); + Html4SaxParserContext instance = new Html4SaxParserContext(runtime, klazz); instance.initialize(runtime); return instance; } public - HtmlSaxParserContext(Ruby ruby, RubyClass rubyClass) + Html4SaxParserContext(Ruby ruby, RubyClass rubyClass) { super(ruby, rubyClass); } @@ -68,7 +68,7 @@ public class HtmlSaxParserContext extends XmlSaxParserContext return parser; } catch (SAXException ex) { throw new SAXException( - "Problem while creating HTML SAX Parser: " + ex.toString()); + "Problem while creating HTML4 SAX Parser: " + ex.toString()); } } @@ -79,7 +79,7 @@ public class HtmlSaxParserContext extends XmlSaxParserContext IRubyObject data, IRubyObject encoding) { - HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klazz); + Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klazz); String javaEncoding = findEncodingName(context, encoding); if (javaEncoding != null) { CharSequence input = applyEncoding(rubyStringToString(data.convertToString()), javaEncoding); @@ -231,7 +231,7 @@ static EncodingType get(final int ordinal) IRubyObject data, IRubyObject encoding) { - HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass); + Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass); ctx.setInputSourceFile(context, data); 
String javaEncoding = findEncodingName(context, encoding); if (javaEncoding != null) { @@ -247,7 +247,7 @@ static EncodingType get(final int ordinal) IRubyObject data, IRubyObject encoding) { - HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(context.runtime, (RubyClass) klass); + Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(context.runtime, (RubyClass) klass); ctx.setIOInputSource(context, data, context.nil); String javaEncoding = findEncodingName(context, encoding); if (javaEncoding != null) { @@ -258,12 +258,12 @@ static EncodingType get(final int ordinal) /** * Create a new parser context that will read from a raw input stream. - * Meant to be run in a separate thread by HtmlSaxPushParser. + * Meant to be run in a separate thread by Html4SaxPushParser. */ - static HtmlSaxParserContext + static Html4SaxParserContext parse_stream(final Ruby runtime, RubyClass klass, InputStream stream) { - HtmlSaxParserContext ctx = HtmlSaxParserContext.newInstance(runtime, klass); + Html4SaxParserContext ctx = Html4SaxParserContext.newInstance(runtime, klass); ctx.setInputSource(stream); return ctx; } diff --git a/ext/java/nokogiri/HtmlSaxPushParser.java b/ext/java/nokogiri/Html4SaxPushParser.java similarity index 87% rename from ext/java/nokogiri/HtmlSaxPushParser.java rename to ext/java/nokogiri/Html4SaxPushParser.java index b056d1dbfe..d9aa6959e6 100644 --- a/ext/java/nokogiri/HtmlSaxPushParser.java +++ b/ext/java/nokogiri/Html4SaxPushParser.java @@ -27,13 +27,13 @@ import org.jruby.runtime.builtin.IRubyObject; /** - * Class for Nokogiri::HTML::SAX::PushParser + * Class for Nokogiri::HTML4::SAX::PushParser * * @author * @author Piotr Szmielew - based on Nokogiri::XML::SAX::PushParser */ -@JRubyClass(name = "Nokogiri::HTML::SAX::PushParser") -public class HtmlSaxPushParser extends RubyObject +@JRubyClass(name = "Nokogiri::HTML4::SAX::PushParser") +public class Html4SaxPushParser extends RubyObject { ParserContext.Options options; IRubyObject 
saxParser; @@ -41,11 +41,11 @@ public class HtmlSaxPushParser extends RubyObject NokogiriBlockingQueueInputStream stream; private ParserTask parserTask = null; - private FutureTask futureTask = null; + private FutureTask futureTask = null; private ExecutorService executor = null; public - HtmlSaxPushParser(Ruby ruby, RubyClass rubyClass) + Html4SaxPushParser(Ruby ruby, RubyClass rubyClass) { super(ruby, rubyClass); } @@ -111,7 +111,7 @@ public class HtmlSaxPushParser extends RubyObject final ByteArrayInputStream data = NokogiriHelpers.stringBytesToStream(chunk); if (data == null) { terminateTask(context.runtime); - throw XmlSyntaxError.createHTMLSyntaxError(context.runtime).toThrowable(); // Nokogiri::HTML::SyntaxError + throw XmlSyntaxError.createHTMLSyntaxError(context.runtime).toThrowable(); // Nokogiri::HTML4::SyntaxError } int errorCount0 = parserTask.getErrorCount(); @@ -149,12 +149,12 @@ public class HtmlSaxPushParser extends RubyObject assert saxParser != null : "saxParser null"; parserTask = new ParserTask(context, saxParser, stream); - futureTask = new FutureTask((Callable) parserTask); + futureTask = new FutureTask((Callable) parserTask); executor = Executors.newSingleThreadExecutor(new ThreadFactory() { @Override public Thread newThread(Runnable r) { Thread t = new Thread(r); - t.setName("HtmlSaxPushParser"); + t.setName("Html4SaxPushParser"); t.setDaemon(true); return t; } @@ -187,14 +187,14 @@ public Thread newThread(Runnable r) { futureTask = null; } - private static HtmlSaxParserContext + private static Html4SaxParserContext parse(final Ruby runtime, final InputStream stream) { - RubyClass klazz = getNokogiriClass(runtime, "Nokogiri::HTML::SAX::ParserContext"); - return HtmlSaxParserContext.parse_stream(runtime, klazz, stream); + RubyClass klazz = getNokogiriClass(runtime, "Nokogiri::HTML4::SAX::ParserContext"); + return Html4SaxParserContext.parse_stream(runtime, klazz, stream); } - static class ParserTask extends XmlSaxPushParser.ParserTask /* */ 
+ static class ParserTask extends XmlSaxPushParser.ParserTask /* */ { private @@ -204,10 +204,10 @@ static class ParserTask extends XmlSaxPushParser.ParserTask /* nokogiriClassCache = new HashMap(); nokogiriClassCache.put("Nokogiri::EncodingHandler", (RubyClass)ruby.getClassFromPath("Nokogiri::EncodingHandler")); - nokogiriClassCache.put("Nokogiri::HTML::Document", (RubyClass)ruby.getClassFromPath("Nokogiri::HTML::Document")); - nokogiriClassCache.put("Nokogiri::HTML::ElementDescription", - (RubyClass)ruby.getClassFromPath("Nokogiri::HTML::ElementDescription")); + nokogiriClassCache.put("Nokogiri::HTML4::Document", (RubyClass)ruby.getClassFromPath("Nokogiri::HTML4::Document")); + nokogiriClassCache.put("Nokogiri::HTML4::ElementDescription", + (RubyClass)ruby.getClassFromPath("Nokogiri::HTML4::ElementDescription")); nokogiriClassCache.put("Nokogiri::XML::Attr", (RubyClass)ruby.getClassFromPath("Nokogiri::XML::Attr")); nokogiriClassCache.put("Nokogiri::XML::Document", (RubyClass)ruby.getClassFromPath("Nokogiri::XML::Document")); nokogiriClassCache.put("Nokogiri::XML::DocumentFragment", @@ -81,7 +81,7 @@ public class NokogiriService implements BasicLibraryService RubyModule nokogiri = ruby.defineModule("Nokogiri"); RubyModule xmlModule = nokogiri.defineModuleUnder("XML"); RubyModule xmlSaxModule = xmlModule.defineModuleUnder("SAX"); - RubyModule htmlModule = nokogiri.defineModuleUnder("HTML"); + RubyModule htmlModule = nokogiri.defineModuleUnder("HTML4"); RubyModule htmlSaxModule = htmlModule.defineModuleUnder("SAX"); RubyModule xsltModule = nokogiri.defineModuleUnder("XSLT"); @@ -201,11 +201,11 @@ public class NokogiriService implements BasicLibraryService { RubyClass htmlElemDesc = htmlModule.defineClassUnder("ElementDescription", ruby.getObject(), HTML_ELEMENT_DESCRIPTION_ALLOCATOR); - htmlElemDesc.defineAnnotatedMethods(HtmlElementDescription.class); + htmlElemDesc.defineAnnotatedMethods(Html4ElementDescription.class); RubyClass htmlEntityLookup = 
htmlModule.defineClassUnder("EntityLookup", ruby.getObject(), HTML_ENTITY_LOOKUP_ALLOCATOR); - htmlEntityLookup.defineAnnotatedMethods(HtmlEntityLookup.class); + htmlEntityLookup.defineAnnotatedMethods(Html4EntityLookup.class); } private void @@ -216,7 +216,7 @@ public class NokogiriService implements BasicLibraryService //RubyModule htmlDoc = html.defineOrGetClassUnder("Document", document); RubyModule htmlDocument = htmlModule.defineClassUnder("Document", xmlDocument, HTML_DOCUMENT_ALLOCATOR); - htmlDocument.defineAnnotatedMethods(HtmlDocument.class); + htmlDocument.defineAnnotatedMethods(Html4Document.class); } private void @@ -231,11 +231,11 @@ public class NokogiriService implements BasicLibraryService RubyClass htmlSaxPushParser = htmlSaxModule.defineClassUnder("PushParser", ruby.getObject(), HTML_SAXPUSHPARSER_ALLOCATOR); - htmlSaxPushParser.defineAnnotatedMethods(HtmlSaxPushParser.class); + htmlSaxPushParser.defineAnnotatedMethods(Html4SaxPushParser.class); RubyClass htmlSaxParserContext = htmlSaxModule.defineClassUnder("ParserContext", xmlSaxParserContext, HTML_SAXPARSER_CONTEXT_ALLOCATOR); - htmlSaxParserContext.defineAnnotatedMethods(HtmlSaxParserContext.class); + htmlSaxParserContext.defineAnnotatedMethods(Html4SaxParserContext.class); } private void @@ -255,30 +255,30 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { public static final ObjectAllocator HTML_DOCUMENT_ALLOCATOR = new ObjectAllocator() { - private HtmlDocument htmlDocument = null; + private Html4Document htmlDocument = null; public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - if (htmlDocument == null) { htmlDocument = new HtmlDocument(runtime, klazz); } + if (htmlDocument == null) { htmlDocument = new Html4Document(runtime, klazz); } try { - HtmlDocument clone = (HtmlDocument) htmlDocument.clone(); + Html4Document clone = (Html4Document) htmlDocument.clone(); clone.setMetaClass(klazz); return clone; } catch (CloneNotSupportedException e) { - return new 
HtmlDocument(runtime, klazz); + return new Html4Document(runtime, klazz); } } }; private static final ObjectAllocator HTML_SAXPARSER_CONTEXT_ALLOCATOR = new ObjectAllocator() { - private HtmlSaxParserContext htmlSaxParserContext = null; + private Html4SaxParserContext htmlSaxParserContext = null; public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - if (htmlSaxParserContext == null) { htmlSaxParserContext = new HtmlSaxParserContext(runtime, klazz); } + if (htmlSaxParserContext == null) { htmlSaxParserContext = new Html4SaxParserContext(runtime, klazz); } try { - HtmlSaxParserContext clone = (HtmlSaxParserContext) htmlSaxParserContext.clone(); + Html4SaxParserContext clone = (Html4SaxParserContext) htmlSaxParserContext.clone(); clone.setMetaClass(klazz); return clone; } catch (CloneNotSupportedException e) { - return new HtmlSaxParserContext(runtime, klazz); + return new Html4SaxParserContext(runtime, klazz); } } }; @@ -287,7 +287,7 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { new ObjectAllocator() { public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new HtmlElementDescription(runtime, klazz); + return new Html4ElementDescription(runtime, klazz); } }; @@ -295,7 +295,7 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { new ObjectAllocator() { public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new HtmlEntityLookup(runtime, klazz); + return new Html4EntityLookup(runtime, klazz); } }; @@ -571,7 +571,7 @@ public IRubyObject allocate(Ruby runtime, RubyClass klazz) { private static final ObjectAllocator HTML_SAXPUSHPARSER_ALLOCATOR = new ObjectAllocator() { public IRubyObject allocate(Ruby runtime, RubyClass klazz) { - return new HtmlSaxPushParser(runtime, klazz); + return new Html4SaxPushParser(runtime, klazz); } }; diff --git a/ext/java/nokogiri/XmlAttr.java b/ext/java/nokogiri/XmlAttr.java index 6335faa89f..6c0e96636e 100644 --- a/ext/java/nokogiri/XmlAttr.java +++ 
b/ext/java/nokogiri/XmlAttr.java @@ -117,7 +117,7 @@ public class XmlAttr extends XmlNode String attrName = ((Attr) node).getName(); if (attrName == null) { return context.nil; } - if (node.getNamespaceURI() != null && !(document(context.runtime) instanceof HtmlDocument)) { + if (node.getNamespaceURI() != null && !(document(context.runtime) instanceof Html4Document)) { attrName = NokogiriHelpers.getLocalPart(attrName); if (attrName == null) { return context.nil; } } @@ -137,7 +137,7 @@ public class XmlAttr extends XmlNode isHtml(ThreadContext context) { return document(context).getMetaClass().isKindOfModule(getNokogiriClass(context.getRuntime(), - "Nokogiri::HTML::Document")); + "Nokogiri::HTML4::Document")); } @Override diff --git a/ext/java/nokogiri/XmlDocument.java b/ext/java/nokogiri/XmlDocument.java index 0d859e730f..6a3ebeef3a 100644 --- a/ext/java/nokogiri/XmlDocument.java +++ b/ext/java/nokogiri/XmlDocument.java @@ -280,8 +280,8 @@ private static class DocumentBuilderFactoryHolder XmlDocument xmlDocument; try { Document docNode = createNewDocument(runtime); - if ("Nokogiri::HTML::Document".equals(((RubyClass)klazz).getName())) { - xmlDocument = new HtmlDocument(context.runtime, (RubyClass) klazz, docNode); + if ("Nokogiri::HTML4::Document".equals(((RubyClass)klazz).getName())) { + xmlDocument = new Html4Document(context.runtime, (RubyClass) klazz, docNode); } else { xmlDocument = new XmlDocument(context.runtime, (RubyClass) klazz, docNode); } diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java index 9cab7c0823..d1ced80f74 100644 --- a/ext/java/nokogiri/XmlNode.java +++ b/ext/java/nokogiri/XmlNode.java @@ -645,7 +645,7 @@ public class XmlNode extends RubyObject final XmlDocument doc = document(context.runtime); for (int i = 0; i < nodeMap.getLength(); i++) { - if ((doc instanceof HtmlDocument) || !NokogiriHelpers.isNamespace(nodeMap.item(i))) { + if ((doc instanceof Html4Document) || !NokogiriHelpers.isNamespace(nodeMap.item(i))) { 
attr.append(getCachedNodeOrCreate(runtime, nodeMap.item(i))); } } @@ -811,8 +811,8 @@ public class XmlNode extends RubyObject XmlDocument document = document(runtime); if (document == null) { return context.nil; } - if (document instanceof HtmlDocument) { - klass = getNokogiriClass(runtime, "Nokogiri::HTML::Document"); + if (document instanceof Html4Document) { + klass = getNokogiriClass(runtime, "Nokogiri::HTML4::Document"); ctx = new HtmlDomParserContext(runtime, options); ((HtmlDomParserContext) ctx).enableDocumentFragment(); ctx.setStringInputSource(context, str, context.nil); @@ -824,7 +824,7 @@ public class XmlNode extends RubyObject // TODO: for some reason, document.getEncoding() can be null or nil (don't know why) // run `test_parse_with_unparented_html_text_context_node' few times to see this happen - if (document instanceof HtmlDocument && !(document.getEncoding() == null || document.getEncoding().isNil())) { + if (document instanceof Html4Document && !(document.getEncoding() == null || document.getEncoding().isNil())) { HtmlDomParserContext htmlCtx = (HtmlDomParserContext) ctx; htmlCtx.setEncoding(document.getEncoding().asJavaString()); } @@ -1148,7 +1148,7 @@ public class XmlNode extends RubyObject namespace(ThreadContext context) { final XmlDocument doc = document(context.runtime); - if (doc instanceof HtmlDocument) { return context.nil; } + if (doc instanceof Html4Document) { return context.nil; } String namespaceURI = node.getNamespaceURI(); if (namespaceURI == null || namespaceURI.isEmpty()) { @@ -1183,7 +1183,7 @@ public class XmlNode extends RubyObject // updated. 
final XmlDocument doc = document(context.runtime); if (doc == null) { return context.runtime.newEmptyArray(); } - if (doc instanceof HtmlDocument) { return context.runtime.newEmptyArray(); } + if (doc instanceof Html4Document) { return context.runtime.newEmptyArray(); } List namespaces = doc.getNamespaceCache().get(node); return context.runtime.newArray((List) namespaces); @@ -1199,7 +1199,7 @@ public class XmlNode extends RubyObject { final XmlDocument doc = document(context.runtime); if (doc == null) { return context.runtime.newEmptyArray(); } - if (doc instanceof HtmlDocument) { return context.runtime.newEmptyArray(); } + if (doc instanceof Html4Document) { return context.runtime.newEmptyArray(); } Node previousNode; if (node.getNodeType() == Node.ELEMENT_NODE) { @@ -1335,7 +1335,7 @@ public class XmlNode extends RubyObject private boolean isHtmlDoc(ThreadContext context) { - return document(context).getMetaClass().isKindOfModule(getNokogiriClass(context.runtime, "Nokogiri::HTML::Document")); + return document(context).getMetaClass().isKindOfModule(getNokogiriClass(context.runtime, "Nokogiri::HTML4::Document")); } private boolean @@ -1549,7 +1549,7 @@ public class XmlNode extends RubyObject type = "COMMENT_NODE"; break; case Node.DOCUMENT_NODE: - if (this instanceof HtmlDocument) { + if (this instanceof Html4Document) { type = "HTML_DOCUMENT_NODE"; } else { type = "DOCUMENT_NODE"; diff --git a/ext/java/nokogiri/XmlSaxPushParser.java b/ext/java/nokogiri/XmlSaxPushParser.java index 8fa419b5e5..019965df8c 100644 --- a/ext/java/nokogiri/XmlSaxPushParser.java +++ b/ext/java/nokogiri/XmlSaxPushParser.java @@ -211,7 +211,7 @@ public Thread newThread(Runnable r) { futureTask = null; } - // SHARED for HtmlSaxPushParser + // SHARED for Html4SaxPushParser static void terminateExecution(final ExecutorService executor, final NokogiriBlockingQueueInputStream stream, final FutureTask futureTask) @@ -248,7 +248,7 @@ static class ParserTask extends ParserContext.ParserTask 
this(context, handler, parse(context.runtime, stream), stream); } - // IMPL with HtmlSaxPushParser + // IMPL with Html4SaxPushParser protected ParserTask(ThreadContext context, IRubyObject handler, XmlSaxParserContext parser, InputStream stream) { diff --git a/ext/java/nokogiri/XmlSyntaxError.java b/ext/java/nokogiri/XmlSyntaxError.java index f87c7d2e18..2555d4ca42 100644 --- a/ext/java/nokogiri/XmlSyntaxError.java +++ b/ext/java/nokogiri/XmlSyntaxError.java @@ -63,7 +63,7 @@ public class XmlSyntaxError extends RubyException public static XmlSyntaxError createHTMLSyntaxError(final Ruby runtime) { - RubyClass klazz = (RubyClass) runtime.getClassFromPath("Nokogiri::HTML::SyntaxError"); + RubyClass klazz = (RubyClass) runtime.getClassFromPath("Nokogiri::HTML4::SyntaxError"); return new XmlSyntaxError(runtime, klazz); } diff --git a/ext/java/nokogiri/XsltStylesheet.java b/ext/java/nokogiri/XsltStylesheet.java index 9bb322c46c..d5329f120b 100644 --- a/ext/java/nokogiri/XsltStylesheet.java +++ b/ext/java/nokogiri/XsltStylesheet.java @@ -286,7 +286,7 @@ public class XsltStylesheet extends RubyObject createDocumentFromDomResult(ThreadContext context, Ruby runtime, DOMResult domResult) { if ("html".equals(domResult.getNode().getFirstChild().getNodeName())) { - return new HtmlDocument(context.runtime, (Document) domResult.getNode()); + return new Html4Document(context.runtime, (Document) domResult.getNode()); } else { return new XmlDocument(context.runtime, (Document) domResult.getNode()); } @@ -322,7 +322,7 @@ public class XsltStylesheet extends RubyObject RubyClass parse_options = (RubyClass)runtime.getClassFromPath("Nokogiri::XML::ParseOptions"); if (htmlish) { args[3] = parse_options.getConstant("DEFAULT_HTML"); - RubyClass htmlDocumentClass = getNokogiriClass(runtime, "Nokogiri::HTML::Document"); + RubyClass htmlDocumentClass = getNokogiriClass(runtime, "Nokogiri::HTML4::Document"); return Helpers.invoke(context, htmlDocumentClass, "parse", args); } else { args[3] = 
parse_options.getConstant("DEFAULT_XML"); diff --git a/ext/java/nokogiri/internals/HtmlDomParserContext.java b/ext/java/nokogiri/internals/HtmlDomParserContext.java index 20200a929c..80c2f96ee9 100644 --- a/ext/java/nokogiri/internals/HtmlDomParserContext.java +++ b/ext/java/nokogiri/internals/HtmlDomParserContext.java @@ -4,7 +4,7 @@ import static nokogiri.internals.NokogiriHelpers.isNamespace; import static nokogiri.internals.NokogiriHelpers.stringOrNil; -import nokogiri.HtmlDocument; +import nokogiri.Html4Document; import nokogiri.NokogiriService; import nokogiri.XmlDocument; import nokogiri.XmlSyntaxError; @@ -28,7 +28,7 @@ import org.w3c.dom.NodeList; /** - * Parser for HtmlDocument. This class actually parses HtmlDocument using NekoHtml. + * Parser for Html4Document. This class actually parses Html4Document using NekoHtml. * * @author sergio * @author Patrick Mahoney @@ -115,12 +115,12 @@ public class HtmlDomParserContext extends XmlDomParserContext protected XmlDocument wrapDocument(ThreadContext context, RubyClass klass, Document document) { - HtmlDocument htmlDocument = new HtmlDocument(context.runtime, klass, document); + Html4Document htmlDocument = new Html4Document(context.runtime, klass, document); htmlDocument.setDocumentNode(context.runtime, document); Helpers.invoke(context, htmlDocument, "initialize"); if (ruby_encoding.isNil()) { - // ruby_encoding might have detected by HtmlDocument::EncodingReader + // ruby_encoding might have detected by Html4Document::EncodingReader if (detected_encoding != null && !detected_encoding.isNil()) { ruby_encoding = detected_encoding; } else { diff --git a/ext/java/nokogiri/internals/NokogiriHandler.java b/ext/java/nokogiri/internals/NokogiriHandler.java index 897a6dcf13..86a39c1204 100644 --- a/ext/java/nokogiri/internals/NokogiriHandler.java +++ b/ext/java/nokogiri/internals/NokogiriHandler.java @@ -59,7 +59,7 @@ public class NokogiriHandler extends DefaultHandler2 implements XmlDeclHandler this.object = object; 
charactersBuilder = new StringBuilder(); String objectName = object.getMetaClass().getName(); - if ("Nokogiri::HTML::SAX::Parser".equals(objectName)) { needEmptyAttrCheck = true; } + if ("Nokogiri::HTML4::SAX::Parser".equals(objectName)) { needEmptyAttrCheck = true; } } @Override diff --git a/ext/java/nokogiri/internals/NokogiriHelpers.java b/ext/java/nokogiri/internals/NokogiriHelpers.java index 1625c9f8b3..c506bba394 100644 --- a/ext/java/nokogiri/internals/NokogiriHelpers.java +++ b/ext/java/nokogiri/internals/NokogiriHelpers.java @@ -25,7 +25,7 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; -import nokogiri.HtmlDocument; +import nokogiri.Html4Document; import nokogiri.NokogiriService; import nokogiri.XmlAttr; import nokogiri.XmlCdata; @@ -89,7 +89,7 @@ public class NokogiriHelpers if (node == null) { return runtime.getNil(); } if (node.getNodeType() == Node.ATTRIBUTE_NODE && isNamespace(node.getNodeName())) { XmlDocument xmlDocument = (XmlDocument) node.getOwnerDocument().getUserData(CACHED_NODE); - if (!(xmlDocument instanceof HtmlDocument)) { + if (!(xmlDocument instanceof Html4Document)) { String prefix = getLocalNameForNamespace(((Attr) node).getName(), null); String href = ((Attr) node).getValue(); XmlNamespace xmlNamespace = xmlDocument.getNamespaceCache().get(prefix, href); @@ -723,8 +723,8 @@ public class NokogiriHelpers public static CharSequence convertEncodingByNKFIfNecessary(ThreadContext context, XmlDocument doc, CharSequence str) { - if (!(doc instanceof HtmlDocument)) { return str; } - String parsed_encoding = ((HtmlDocument)doc).getPraedEncoding(); + if (!(doc instanceof Html4Document)) { return str; } + String parsed_encoding = ((Html4Document)doc).getPraedEncoding(); if (parsed_encoding == null) { return str; } String ruby_encoding = rubyStringToString(doc.getEncoding()); if (ruby_encoding == null) { return str; } diff --git a/ext/nokogiri/gumbo.c b/ext/nokogiri/gumbo.c index 69823927fa..14fb590ef3 100644 --- a/ext/nokogiri/gumbo.c 
+++ b/ext/nokogiri/gumbo.c @@ -342,7 +342,9 @@ parse_cleanup(VALUE parse_args) static VALUE parse_continue(VALUE parse_args); -// Parse a string using gumbo_parse into a Nokogiri document +/* + * @!visibility protected + */ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) { @@ -431,6 +433,9 @@ extract_xml_node(VALUE node) static VALUE fragment_continue(VALUE parse_args); +/* + * @!visibility protected + */ static VALUE fragment( VALUE self, @@ -591,7 +596,7 @@ void noko_init_gumbo() { // Class constants. - cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtmlDocument); + cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtml4Document); rb_gc_register_mark_object(cNokogiriHtml5Document); // Interned symbols. diff --git a/ext/nokogiri/html_document.c b/ext/nokogiri/html4_document.c similarity index 88% rename from ext/nokogiri/html_document.c rename to ext/nokogiri/html4_document.c index 7462f854ef..9e9a016957 100644 --- a/ext/nokogiri/html_document.c +++ b/ext/nokogiri/html4_document.c @@ -1,6 +1,6 @@ #include -VALUE cNokogiriHtmlDocument ; +VALUE cNokogiriHtml4Document ; static ID id_encoding_found; static ID id_to_s; @@ -34,7 +34,7 @@ rb_html_document_s_new(int argc, VALUE *argv, VALUE klass) * read_io(io, url, encoding, options) * * Read the HTML document from +io+ with given +url+, +encoding+, - * and +options+. See Nokogiri::HTML.parse + * and +options+. See Nokogiri::HTML4.parse */ static VALUE rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) @@ -92,7 +92,7 @@ rb_html_document_s_read_io(VALUE klass, VALUE rb_io, VALUE rb_url, VALUE rb_enco * read_memory(string, url, encoding, options) * * Read the HTML document contained in +string+ with given +url+, +encoding+, - * and +options+. See Nokogiri::HTML.parse + * and +options+. 
See Nokogiri::HTML4.parse */ static VALUE rb_html_document_s_read_memory(VALUE klass, VALUE rb_html, VALUE rb_url, VALUE rb_encoding, VALUE rb_options) @@ -153,13 +153,13 @@ void noko_init_html_document() { assert(cNokogiriXmlDocument); - cNokogiriHtmlDocument = rb_define_class_under(mNokogiriHtml, "Document", cNokogiriXmlDocument); + cNokogiriHtml4Document = rb_define_class_under(mNokogiriHtml4, "Document", cNokogiriXmlDocument); - rb_define_singleton_method(cNokogiriHtmlDocument, "read_memory", rb_html_document_s_read_memory, 4); - rb_define_singleton_method(cNokogiriHtmlDocument, "read_io", rb_html_document_s_read_io, 4); - rb_define_singleton_method(cNokogiriHtmlDocument, "new", rb_html_document_s_new, -1); + rb_define_singleton_method(cNokogiriHtml4Document, "read_memory", rb_html_document_s_read_memory, 4); + rb_define_singleton_method(cNokogiriHtml4Document, "read_io", rb_html_document_s_read_io, 4); + rb_define_singleton_method(cNokogiriHtml4Document, "new", rb_html_document_s_new, -1); - rb_define_method(cNokogiriHtmlDocument, "type", rb_html_document_type, 0); + rb_define_method(cNokogiriHtml4Document, "type", rb_html_document_type, 0); id_encoding_found = rb_intern("encoding_found"); id_to_s = rb_intern("to_s"); diff --git a/ext/nokogiri/html_element_description.c b/ext/nokogiri/html4_element_description.c similarity index 76% rename from ext/nokogiri/html_element_description.c rename to ext/nokogiri/html4_element_description.c index e4b044d3b8..3344772f3f 100644 --- a/ext/nokogiri/html_element_description.c +++ b/ext/nokogiri/html4_element_description.c @@ -1,6 +1,6 @@ #include -VALUE cNokogiriHtmlElementDescription ; +VALUE cNokogiriHtml4ElementDescription ; /* * call-seq: @@ -272,23 +272,23 @@ get_description(VALUE klass, VALUE tag_name) void noko_init_html_element_description() { - cNokogiriHtmlElementDescription = rb_define_class_under(mNokogiriHtml, "ElementDescription", rb_cObject); - - rb_undef_alloc_func(cNokogiriHtmlElementDescription); - - 
rb_define_singleton_method(cNokogiriHtmlElementDescription, "[]", get_description, 1); - - rb_define_method(cNokogiriHtmlElementDescription, "name", name, 0); - rb_define_method(cNokogiriHtmlElementDescription, "implied_start_tag?", implied_start_tag_eh, 0); - rb_define_method(cNokogiriHtmlElementDescription, "implied_end_tag?", implied_end_tag_eh, 0); - rb_define_method(cNokogiriHtmlElementDescription, "save_end_tag?", save_end_tag_eh, 0); - rb_define_method(cNokogiriHtmlElementDescription, "empty?", empty_eh, 0); - rb_define_method(cNokogiriHtmlElementDescription, "deprecated?", deprecated_eh, 0); - rb_define_method(cNokogiriHtmlElementDescription, "inline?", inline_eh, 0); - rb_define_method(cNokogiriHtmlElementDescription, "description", description, 0); - rb_define_method(cNokogiriHtmlElementDescription, "sub_elements", sub_elements, 0); - rb_define_method(cNokogiriHtmlElementDescription, "default_sub_element", default_sub_element, 0); - rb_define_method(cNokogiriHtmlElementDescription, "optional_attributes", optional_attributes, 0); - rb_define_method(cNokogiriHtmlElementDescription, "deprecated_attributes", deprecated_attributes, 0); - rb_define_method(cNokogiriHtmlElementDescription, "required_attributes", required_attributes, 0); + cNokogiriHtml4ElementDescription = rb_define_class_under(mNokogiriHtml4, "ElementDescription", rb_cObject); + + rb_undef_alloc_func(cNokogiriHtml4ElementDescription); + + rb_define_singleton_method(cNokogiriHtml4ElementDescription, "[]", get_description, 1); + + rb_define_method(cNokogiriHtml4ElementDescription, "name", name, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "implied_start_tag?", implied_start_tag_eh, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "implied_end_tag?", implied_end_tag_eh, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "save_end_tag?", save_end_tag_eh, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "empty?", empty_eh, 0); + 
rb_define_method(cNokogiriHtml4ElementDescription, "deprecated?", deprecated_eh, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "inline?", inline_eh, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "description", description, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "sub_elements", sub_elements, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "default_sub_element", default_sub_element, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "optional_attributes", optional_attributes, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "deprecated_attributes", deprecated_attributes, 0); + rb_define_method(cNokogiriHtml4ElementDescription, "required_attributes", required_attributes, 0); } diff --git a/ext/nokogiri/html_entity_lookup.c b/ext/nokogiri/html4_entity_lookup.c similarity index 60% rename from ext/nokogiri/html_entity_lookup.c rename to ext/nokogiri/html4_entity_lookup.c index a63f58a528..ee1589cb43 100644 --- a/ext/nokogiri/html_entity_lookup.c +++ b/ext/nokogiri/html4_entity_lookup.c @@ -1,17 +1,17 @@ #include -static VALUE cNokogiriHtmlEntityLookup; +static VALUE cNokogiriHtml4EntityLookup; /* * call-seq: * get(key) * - * Get the HTML::EntityDescription for +key+ + * Get the HTML4::EntityDescription for +key+ */ static VALUE get(VALUE _, VALUE rb_entity_name) { - VALUE cNokogiriHtmlEntityDescription; + VALUE cNokogiriHtml4EntityDescription; const htmlEntityDesc *c_entity_desc; VALUE rb_constructor_args[3]; @@ -24,14 +24,14 @@ get(VALUE _, VALUE rb_entity_name) rb_constructor_args[1] = NOKOGIRI_STR_NEW2(c_entity_desc->name); rb_constructor_args[2] = NOKOGIRI_STR_NEW2(c_entity_desc->desc); - cNokogiriHtmlEntityDescription = rb_const_get_at(mNokogiriHtml, rb_intern("EntityDescription")); - return rb_class_new_instance(3, rb_constructor_args, cNokogiriHtmlEntityDescription); + cNokogiriHtml4EntityDescription = rb_const_get_at(mNokogiriHtml4, rb_intern("EntityDescription")); + return rb_class_new_instance(3, 
rb_constructor_args, cNokogiriHtml4EntityDescription); } void noko_init_html_entity_lookup() { - cNokogiriHtmlEntityLookup = rb_define_class_under(mNokogiriHtml, "EntityLookup", rb_cObject); + cNokogiriHtml4EntityLookup = rb_define_class_under(mNokogiriHtml4, "EntityLookup", rb_cObject); - rb_define_method(cNokogiriHtmlEntityLookup, "get", get, 1); + rb_define_method(cNokogiriHtml4EntityLookup, "get", get, 1); } diff --git a/ext/nokogiri/html_sax_parser_context.c b/ext/nokogiri/html4_sax_parser_context.c similarity index 86% rename from ext/nokogiri/html_sax_parser_context.c rename to ext/nokogiri/html4_sax_parser_context.c index eb3361b296..6462442eec 100644 --- a/ext/nokogiri/html_sax_parser_context.c +++ b/ext/nokogiri/html4_sax_parser_context.c @@ -1,6 +1,6 @@ #include -VALUE cNokogiriHtmlSaxParserContext ; +VALUE cNokogiriHtml4SaxParserContext ; static void deallocate(xmlParserCtxtPtr ctxt) @@ -110,10 +110,10 @@ void noko_init_html_sax_parser_context() { assert(cNokogiriXmlSaxParserContext); - cNokogiriHtmlSaxParserContext = rb_define_class_under(mNokogiriHtmlSax, "ParserContext", cNokogiriXmlSaxParserContext); + cNokogiriHtml4SaxParserContext = rb_define_class_under(mNokogiriHtml4Sax, "ParserContext", cNokogiriXmlSaxParserContext); - rb_define_singleton_method(cNokogiriHtmlSaxParserContext, "memory", parse_memory, 2); - rb_define_singleton_method(cNokogiriHtmlSaxParserContext, "file", parse_file, 2); + rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "memory", parse_memory, 2); + rb_define_singleton_method(cNokogiriHtml4SaxParserContext, "file", parse_file, 2); - rb_define_method(cNokogiriHtmlSaxParserContext, "parse_with", parse_with, 1); + rb_define_method(cNokogiriHtml4SaxParserContext, "parse_with", parse_with, 1); } diff --git a/ext/nokogiri/html_sax_push_parser.c b/ext/nokogiri/html4_sax_push_parser.c similarity index 85% rename from ext/nokogiri/html_sax_push_parser.c rename to ext/nokogiri/html4_sax_push_parser.c index 30f3e18465..9dc4a8c2c2 
100644 --- a/ext/nokogiri/html_sax_push_parser.c +++ b/ext/nokogiri/html4_sax_push_parser.c @@ -1,6 +1,6 @@ #include -VALUE cNokogiriHtmlSaxPushParser; +VALUE cNokogiriHtml4SaxPushParser; /* * call-seq: @@ -88,8 +88,8 @@ void noko_init_html_sax_push_parser() { assert(cNokogiriXmlSaxPushParser); - cNokogiriHtmlSaxPushParser = rb_define_class_under(mNokogiriHtmlSax, "PushParser", cNokogiriXmlSaxPushParser); + cNokogiriHtml4SaxPushParser = rb_define_class_under(mNokogiriHtml4Sax, "PushParser", cNokogiriXmlSaxPushParser); - rb_define_private_method(cNokogiriHtmlSaxPushParser, "initialize_native", initialize_native, 3); - rb_define_private_method(cNokogiriHtmlSaxPushParser, "native_write", native_write, 2); + rb_define_private_method(cNokogiriHtml4SaxPushParser, "initialize_native", initialize_native, 3); + rb_define_private_method(cNokogiriHtml4SaxPushParser, "native_write", native_write, 2); } diff --git a/ext/nokogiri/nokogiri.c b/ext/nokogiri/nokogiri.c index f16f84267d..80dab3ca71 100644 --- a/ext/nokogiri/nokogiri.c +++ b/ext/nokogiri/nokogiri.c @@ -2,8 +2,8 @@ VALUE mNokogiri ; VALUE mNokogiriGumbo ; -VALUE mNokogiriHtml ; -VALUE mNokogiriHtmlSax ; +VALUE mNokogiriHtml4 ; +VALUE mNokogiriHtml4Sax ; VALUE mNokogiriHtml5 ; VALUE mNokogiriXml ; VALUE mNokogiriXmlSax ; @@ -156,8 +156,8 @@ Init_nokogiri() { mNokogiri = rb_define_module("Nokogiri"); mNokogiriGumbo = rb_define_module_under(mNokogiri, "Gumbo"); - mNokogiriHtml = rb_define_module_under(mNokogiri, "HTML"); - mNokogiriHtmlSax = rb_define_module_under(mNokogiriHtml, "SAX"); + mNokogiriHtml4 = rb_define_module_under(mNokogiri, "HTML4"); + mNokogiriHtml4Sax = rb_define_module_under(mNokogiriHtml4, "SAX"); mNokogiriHtml5 = rb_define_module_under(mNokogiri, "HTML5"); mNokogiriXml = rb_define_module_under(mNokogiri, "XML"); mNokogiriXmlSax = rb_define_module_under(mNokogiriXml, "SAX"); diff --git a/ext/nokogiri/nokogiri.h b/ext/nokogiri/nokogiri.h index 16245aea64..ccae81460d 100644 --- a/ext/nokogiri/nokogiri.h 
+++ b/ext/nokogiri/nokogiri.h @@ -94,8 +94,8 @@ xmlNodePtr xmlLastElementChild(xmlNodePtr parent); NOKOPUBVAR VALUE mNokogiri ; NOKOPUBVAR VALUE mNokogiriGumbo ; -NOKOPUBVAR VALUE mNokogiriHtml ; -NOKOPUBVAR VALUE mNokogiriHtmlSax ; +NOKOPUBVAR VALUE mNokogiriHtml4 ; +NOKOPUBVAR VALUE mNokogiriHtml4Sax ; NOKOPUBVAR VALUE mNokogiriHtml5 ; NOKOPUBVAR VALUE mNokogiriXml ; NOKOPUBVAR VALUE mNokogiriXmlSax ; @@ -133,11 +133,11 @@ NOKOPUBVAR VALUE cNokogiriXmlXpathContext; NOKOPUBVAR VALUE cNokogiriXmlXpathSyntaxError; NOKOPUBVAR VALUE cNokogiriXsltStylesheet ; -NOKOPUBVAR VALUE cNokogiriHtmlDocument ; +NOKOPUBVAR VALUE cNokogiriHtml4Document ; +NOKOPUBVAR VALUE cNokogiriHtml4SaxPushParser ; +NOKOPUBVAR VALUE cNokogiriHtml4ElementDescription ; +NOKOPUBVAR VALUE cNokogiriHtml4SaxParserContext; NOKOPUBVAR VALUE cNokogiriHtml5Document ; -NOKOPUBVAR VALUE cNokogiriHtmlSaxPushParser ; -NOKOPUBVAR VALUE cNokogiriHtmlElementDescription ; -NOKOPUBVAR VALUE cNokogiriHtmlSaxParserContext; typedef struct _nokogiriTuple { VALUE doc; diff --git a/lib/nokogiri.rb b/lib/nokogiri.rb index 5921c75995..b396f47e16 100644 --- a/lib/nokogiri.rb +++ b/lib/nokogiri.rb @@ -2,40 +2,29 @@ # frozen_string_literal: true # Modify the PATH on windows so that the external DLLs will get loaded. -require 'rbconfig' +require "rbconfig" if defined?(RUBY_ENGINE) && RUBY_ENGINE == "jruby" - require 'nokogiri/jruby/dependencies' + require_relative "nokogiri/jruby/dependencies" end -require 'nokogiri/extension' - -require 'nokogiri/version' -require 'nokogiri/syntax_error' -require 'nokogiri/xml' -require 'nokogiri/xslt' -require 'nokogiri/html' -require 'nokogiri/decorators/slop' -require 'nokogiri/css' -require 'nokogiri/html/builder' - -require 'nokogiri/html5' if Nokogiri.uses_gumbo? +require_relative "nokogiri/extension" # Nokogiri parses and searches XML/HTML very quickly, and also has # correctly implemented CSS3 selector support as well as XPath 1.0 # support. 
# # Parsing a document returns either a Nokogiri::XML::Document, or a -# Nokogiri::HTML::Document depending on the kind of document you parse. +# Nokogiri::HTML4::Document depending on the kind of document you parse. # # Here is an example: # # require 'nokogiri' # require 'open-uri' # -# # Get a Nokogiri::HTML:Document for the page weโ€™re interested in... +# # Get a Nokogiri::HTML4::Document for the page weโ€™re interested in... # -# doc = Nokogiri::HTML(URI.open('http://www.google.com/search?q=tenderlove')) +# doc = Nokogiri::HTML4(URI.open('http://www.google.com/search?q=tenderlove')) # # # Do funky things with it using Nokogiri::XML::Node methods... # @@ -51,27 +40,27 @@ module Nokogiri class << self ### # Parse an HTML or XML document. +string+ contains the document. - def parse string, url = nil, encoding = nil, options = nil + def parse(string, url = nil, encoding = nil, options = nil) if string.respond_to?(:read) || /^\s*<(?:!DOCTYPE\s+)?html[\s>]/i === string[0, 512] # Expect an HTML indicator to appear within the first 512 # characters of a document. ( + # shouldn't be that long) - Nokogiri.HTML(string, url, encoding, + Nokogiri.HTML4(string, url, encoding, options || XML::ParseOptions::DEFAULT_HTML) else Nokogiri.XML(string, url, encoding, options || XML::ParseOptions::DEFAULT_XML) - end.tap { |doc| + end.tap do |doc| yield doc if block_given? - } + end end ### # Create a new Nokogiri::XML::DocumentFragment - def make input = nil, opts = {}, &blk + def make(input = nil, opts = {}, &blk) if input - Nokogiri::HTML.fragment(input).children.first + Nokogiri::HTML4.fragment(input).children.first else Nokogiri(&blk) end @@ -100,10 +89,10 @@ def install_default_aliases # Make sure to support some popular encoding aliases not known by # all iconv implementations. { - 'Windows-31J' => 'CP932', # Windows-31J is the IANA registered name of CP932. - }.each { |alias_name, name| + "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932. 
+ }.each do |alias_name, name| EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil? - } + end end end @@ -111,15 +100,26 @@ def install_default_aliases end ### -# Parse a document contained in +args+. Nokogiri will try to guess what -# type of document you are attempting to parse. For more information, see -# Nokogiri.parse +# Parse a document contained in +args+. Nokogiri will try to guess what type of document you are +# attempting to parse. For more information, see Nokogiri.parse # -# To specify the type of document, use Nokogiri.XML or Nokogiri.HTML. +# To specify the type of document, use {Nokogiri.XML}, {Nokogiri.HTML4}, or {Nokogiri.HTML5}. def Nokogiri(*args, &block) if block_given? - Nokogiri::HTML::Builder.new(&block).doc.root + Nokogiri::HTML4::Builder.new(&block).doc.root else Nokogiri.parse(*args) end end + +require_relative "nokogiri/version" +require_relative "nokogiri/syntax_error" +require_relative "nokogiri/xml" +require_relative "nokogiri/xslt" +require_relative "nokogiri/html4" +require_relative "nokogiri/html" +require_relative "nokogiri/decorators/slop" +require_relative "nokogiri/css" +require_relative "nokogiri/html4/builder" + +require_relative "nokogiri/html5" if Nokogiri.uses_gumbo? diff --git a/lib/nokogiri/css.rb b/lib/nokogiri/css.rb index 786aa5e239..1b8fbdca24 100644 --- a/lib/nokogiri/css.rb +++ b/lib/nokogiri/css.rb @@ -1,28 +1,28 @@ # frozen_string_literal: true -require 'nokogiri/css/node' -require 'nokogiri/css/xpath_visitor' -x = $-w -$-w = false -require 'nokogiri/css/parser' -$-w = x - -require 'nokogiri/css/tokenizer' -require 'nokogiri/css/syntax_error' - module Nokogiri module CSS class << self ### # Parse this CSS selector in +selector+. Returns an AST. - def parse selector - Parser.new.parse selector + def parse(selector) + Parser.new.parse(selector) end ### # Get the XPath for +selector+. 
- def xpath_for selector, options={} - Parser.new(options[:ns] || {}).xpath_for selector, options + def xpath_for(selector, options = {}) + Parser.new(options[:ns] || {}).xpath_for(selector, options) end end end end + +require_relative "css/node" +require_relative "css/xpath_visitor" +x = $-w +$-w = false +require_relative "css/parser" +$-w = x + +require_relative "css/tokenizer" +require_relative "css/syntax_error" diff --git a/lib/nokogiri/css/parser.rb b/lib/nokogiri/css/parser.rb index 52b96826c9..29bdf59e79 100644 --- a/lib/nokogiri/css/parser.rb +++ b/lib/nokogiri/css/parser.rb @@ -8,7 +8,7 @@ require 'racc/parser.rb' -require 'nokogiri/css/parser_extras' +require_relative "parser_extras" module Nokogiri module CSS diff --git a/lib/nokogiri/css/parser.y b/lib/nokogiri/css/parser.y index 4ac8f9d69b..9ed97c8559 100644 --- a/lib/nokogiri/css/parser.y +++ b/lib/nokogiri/css/parser.y @@ -253,7 +253,7 @@ end ---- header -require 'nokogiri/css/parser_extras' +require_relative "parser_extras" ---- inner diff --git a/lib/nokogiri/css/syntax_error.rb b/lib/nokogiri/css/syntax_error.rb index db3e2e583e..7b48806468 100644 --- a/lib/nokogiri/css/syntax_error.rb +++ b/lib/nokogiri/css/syntax_error.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true -require 'nokogiri/syntax_error' +require_relative "../syntax_error" module Nokogiri module CSS class SyntaxError < ::Nokogiri::SyntaxError diff --git a/lib/nokogiri/extension.rb b/lib/nokogiri/extension.rb index b7de485bb5..73d9269b24 100644 --- a/lib/nokogiri/extension.rb +++ b/lib/nokogiri/extension.rb @@ -3,7 +3,7 @@ # load the C or Java extension begin ::RUBY_VERSION =~ /(\d+\.\d+)/ - require "nokogiri/#{Regexp.last_match(1)}/nokogiri" + require_relative "#{Regexp.last_match(1)}/nokogiri" rescue LoadError => e if e.message =~ /GLIBC/ warn(<<~EOM) @@ -22,5 +22,5 @@ EOM raise e end - require 'nokogiri/nokogiri' + require_relative "nokogiri" end diff --git a/lib/nokogiri/html.rb b/lib/nokogiri/html.rb index 21ce61a8dd..4fbed9bccc 
100644 --- a/lib/nokogiri/html.rb +++ b/lib/nokogiri/html.rb @@ -1,38 +1,42 @@ # frozen_string_literal: true -require 'nokogiri/html/entity_lookup' -require 'nokogiri/html/document' -require 'nokogiri/html/document_fragment' -require 'nokogiri/html/sax/parser_context' -require 'nokogiri/html/sax/parser' -require 'nokogiri/html/sax/push_parser' -require 'nokogiri/html/element_description' -require 'nokogiri/html/element_description_defaults' +require_relative "html4" module Nokogiri - class << self - ### - # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse - def HTML thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block - Nokogiri::HTML::Document.parse(thing, url, encoding, options, &block) - end - end + HTML = Nokogiri::HTML4 + + # @!method HTML(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) + # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse + # @!scope class + define_singleton_method(:HTML, Nokogiri.method(:HTML4)) + # @note This module/namespace is an alias for {Nokogiri::HTML4} as of v1.12.0. Before v1.12.0, + # {Nokogiri::HTML4} did not exist, and this was the module/namespace for all HTML-related + # classes. module HTML - class << self - ### - # Parse HTML. Convenience method for Nokogiri::HTML::Document.parse - def parse thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block - Document.parse(thing, url, encoding, options, &block) + # @note This class is an alias for {Nokogiri::HTML4::Document} as of v1.12.0. + class Document < Nokogiri::XML::Document + end + + # @note This class is an alias for {Nokogiri::HTML4::DocumentFragment} as of v1.12.0. + class DocumentFragment < Nokogiri::XML::DocumentFragment + end + + # @note This class is an alias for {Nokogiri::HTML4::Builder} as of v1.12.0. 
+ class Builder < Nokogiri::XML::Builder + end + + module SAX + # @note This class is an alias for {Nokogiri::HTML4::SAX::Parser} as of v1.12.0. + class Parser < Nokogiri::XML::SAX::Parser end - #### - # Parse a fragment from +string+ in to a NodeSet. - def fragment string, encoding = nil - HTML::DocumentFragment.parse string, encoding + # @note This class is an alias for {Nokogiri::HTML4::SAX::ParserContext} as of v1.12.0. + class ParserContext < Nokogiri::XML::SAX::ParserContext end - end - # Instance of Nokogiri::HTML::EntityLookup - NamedCharacters = EntityLookup.new + # @note This class is an alias for {Nokogiri::HTML4::SAX::PushParser} as of v1.12.0. + class PushParser + end + end end end diff --git a/lib/nokogiri/html/sax/parser_context.rb b/lib/nokogiri/html/sax/parser_context.rb deleted file mode 100644 index 5eb48ca329..0000000000 --- a/lib/nokogiri/html/sax/parser_context.rb +++ /dev/null @@ -1,17 +0,0 @@ -# frozen_string_literal: true -module Nokogiri - module HTML - module SAX - ### - # Context for HTML SAX parsers. This class is usually not instantiated - # by the user. Instead, you should be looking at - # Nokogiri::HTML::SAX::Parser - class ParserContext < Nokogiri::XML::SAX::ParserContext - def self.new thing, encoding = 'UTF-8' - [:read, :close].all? { |x| thing.respond_to?(x) } ? super : - memory(thing, encoding) - end - end - end - end -end diff --git a/lib/nokogiri/html4.rb b/lib/nokogiri/html4.rb new file mode 100644 index 0000000000..b2fe513509 --- /dev/null +++ b/lib/nokogiri/html4.rb @@ -0,0 +1,40 @@ +# frozen_string_literal: true +module Nokogiri + class << self + ### + # Parse HTML. 
Convenience method for Nokogiri::HTML4::Document.parse + def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) + Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block) + end + end + + # @since v1.12.0 + # @note Before v1.12.0, {Nokogiri::HTML4} did not exist, and {Nokogiri::HTML} was the module/namespace for parsing HTML. + module HTML4 + class << self + ### + # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse + def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) + Document.parse(input, url, encoding, options, &block) + end + + #### + # Parse a fragment from +string+ in to a NodeSet. + def fragment(string, encoding = nil) + HTML4::DocumentFragment.parse(string, encoding) + end + end + + # Instance of Nokogiri::HTML4::EntityLookup + NamedCharacters = EntityLookup.new + end +end + +require_relative "html4/entity_lookup" +require_relative "html4/document" +require_relative "html4/document_fragment" +require_relative "html4/sax/parser_context" +require_relative "html4/sax/parser" +require_relative "html4/sax/push_parser" +require_relative "html4/element_description" +require_relative "html4/element_description_defaults" diff --git a/lib/nokogiri/html/builder.rb b/lib/nokogiri/html4/builder.rb similarity index 93% rename from lib/nokogiri/html/builder.rb rename to lib/nokogiri/html4/builder.rb index 098588ae87..a875e4eedc 100644 --- a/lib/nokogiri/html/builder.rb +++ b/lib/nokogiri/html4/builder.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true module Nokogiri - module HTML + module HTML4 ### # Nokogiri HTML builder is used for building HTML documents. It is very # similar to the Nokogiri::XML::Builder. In fact, you should go read the @@ -12,7 +12,7 @@ module HTML # Create an HTML document with a body that has an onload attribute, and a # span tag with a class of "bold" that has content of "Hello world". 
# - # builder = Nokogiri::HTML::Builder.new do |doc| + # builder = Nokogiri::HTML4::Builder.new do |doc| # doc.html { # doc.body(:onload => 'some_func();') { # doc.span.bold { diff --git a/lib/nokogiri/html/document.rb b/lib/nokogiri/html4/document.rb similarity index 97% rename from lib/nokogiri/html/document.rb rename to lib/nokogiri/html4/document.rb index c54676c92f..07f4d0968e 100644 --- a/lib/nokogiri/html/document.rb +++ b/lib/nokogiri/html4/document.rb @@ -3,7 +3,7 @@ require 'pathname' module Nokogiri - module HTML + module HTML4 class Document < Nokogiri::XML::Document ### # Get the meta tag encoding for this document. If there is no meta tag, @@ -268,12 +268,12 @@ def self.detect_encoding(chunk) m = chunk.match(/(#{tags}", nil, document.encoding) + temp_doc = HTML4::Document.parse("#{tags}", nil, document.encoding) temp_doc.xpath(path).each { |child| child.parent = self } self.errors = temp_doc.errors end diff --git a/lib/nokogiri/html/element_description.rb b/lib/nokogiri/html4/element_description.rb similarity index 96% rename from lib/nokogiri/html/element_description.rb rename to lib/nokogiri/html4/element_description.rb index 6c415774da..b025a2ea9e 100644 --- a/lib/nokogiri/html/element_description.rb +++ b/lib/nokogiri/html4/element_description.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true module Nokogiri - module HTML + module HTML4 class ElementDescription ### # Is this element a block element? 
diff --git a/lib/nokogiri/html/element_description_defaults.rb b/lib/nokogiri/html4/element_description_defaults.rb similarity index 99% rename from lib/nokogiri/html/element_description_defaults.rb rename to lib/nokogiri/html4/element_description_defaults.rb index 6763cd496e..f336eec612 100644 --- a/lib/nokogiri/html/element_description_defaults.rb +++ b/lib/nokogiri/html4/element_description_defaults.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true module Nokogiri - module HTML + module HTML4 class ElementDescription # Methods are defined protected by method_defined? because at diff --git a/lib/nokogiri/html/entity_lookup.rb b/lib/nokogiri/html4/entity_lookup.rb similarity index 94% rename from lib/nokogiri/html/entity_lookup.rb rename to lib/nokogiri/html4/entity_lookup.rb index bc66e41afe..a6034e278c 100644 --- a/lib/nokogiri/html/entity_lookup.rb +++ b/lib/nokogiri/html4/entity_lookup.rb @@ -1,6 +1,6 @@ # frozen_string_literal: true module Nokogiri - module HTML + module HTML4 class EntityDescription < Struct.new(:value, :name, :description); end class EntityLookup diff --git a/lib/nokogiri/html/sax/parser.rb b/lib/nokogiri/html4/sax/parser.rb similarity index 75% rename from lib/nokogiri/html/sax/parser.rb rename to lib/nokogiri/html4/sax/parser.rb index 16853e6957..356f37c96c 100644 --- a/lib/nokogiri/html/sax/parser.rb +++ b/lib/nokogiri/html4/sax/parser.rb @@ -1,18 +1,15 @@ # frozen_string_literal: true module Nokogiri - module HTML + module HTML4 ### - # Nokogiri lets you write a SAX parser to process HTML but get HTML - # correction features. + # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features. # - # See Nokogiri::HTML::SAX::Parser for a basic example of using a - # SAX parser with HTML. + # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML. 
# # For more information on SAX parsers, see Nokogiri::XML::SAX module SAX ### - # This class lets you perform SAX style parsing on HTML with HTML - # error correction. + # This class lets you perform SAX style parsing on HTML with HTML error correction. # # Here is a basic usage example: # @@ -22,40 +19,40 @@ module SAX # end # end # - # parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new) + # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new) # parser.parse(File.read(ARGV[0], mode: 'rb')) # # For more information on SAX parsers, see Nokogiri::XML::SAX class Parser < Nokogiri::XML::SAX::Parser ### # Parse html stored in +data+ using +encoding+ - def parse_memory data, encoding = 'UTF-8' + def parse_memory(data, encoding = "UTF-8") raise ArgumentError unless data return unless data.length > 0 ctx = ParserContext.memory(data, encoding) yield ctx if block_given? - ctx.parse_with self + ctx.parse_with(self) end ### # Parse given +io+ - def parse_io io, encoding = 'UTF-8' + def parse_io(io, encoding = "UTF-8") check_encoding(encoding) @encoding = encoding ctx = ParserContext.io(io, ENCODINGS[encoding]) yield ctx if block_given? - ctx.parse_with self + ctx.parse_with(self) end ### # Parse a file with +filename+ - def parse_file filename, encoding = 'UTF-8' + def parse_file(filename, encoding = "UTF-8") raise ArgumentError unless filename raise Errno::ENOENT unless File.exist?(filename) raise Errno::EISDIR if File.directory?(filename) ctx = ParserContext.file(filename, encoding) yield ctx if block_given? - ctx.parse_with self + ctx.parse_with(self) end end end diff --git a/lib/nokogiri/html4/sax/parser_context.rb b/lib/nokogiri/html4/sax/parser_context.rb new file mode 100644 index 0000000000..6d964a3059 --- /dev/null +++ b/lib/nokogiri/html4/sax/parser_context.rb @@ -0,0 +1,19 @@ +# frozen_string_literal: true +module Nokogiri + module HTML4 + module SAX + ### + # Context for HTML SAX parsers. This class is usually not instantiated by the user. 
Instead, + # you should be looking at Nokogiri::HTML4::SAX::Parser + class ParserContext < Nokogiri::XML::SAX::ParserContext + def self.new(thing, encoding = "UTF-8") + if [:read, :close].all? { |x| thing.respond_to?(x) } + super + else + memory(thing, encoding) + end + end + end + end + end +end diff --git a/lib/nokogiri/html/sax/push_parser.rb b/lib/nokogiri/html4/sax/push_parser.rb similarity index 70% rename from lib/nokogiri/html/sax/push_parser.rb rename to lib/nokogiri/html4/sax/push_parser.rb index 6054e00a27..2794757199 100644 --- a/lib/nokogiri/html/sax/push_parser.rb +++ b/lib/nokogiri/html4/sax/push_parser.rb @@ -1,17 +1,17 @@ # frozen_string_literal: true module Nokogiri - module HTML + module HTML4 module SAX class PushParser - # The Nokogiri::HTML::SAX::Document on which the PushParser will be + # The Nokogiri::HTML4::SAX::Document on which the PushParser will be # operating attr_accessor :document - def initialize(doc = HTML::SAX::Document.new, file_name = nil, encoding = 'UTF-8') + def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = 'UTF-8') @document = doc @encoding = encoding - @sax_parser = HTML::SAX::Parser.new(doc, @encoding) + @sax_parser = HTML4::SAX::Parser.new(doc, @encoding) ## Create our push parser context initialize_native(@sax_parser, file_name, encoding) @@ -27,7 +27,7 @@ def write chunk, last_chunk = false ### # Finish the parsing. This method is only necessary for - # Nokogiri::HTML::SAX::Document#end_document to be called. + # Nokogiri::HTML4::SAX::Document#end_document to be called. def finish write '', true end diff --git a/lib/nokogiri/html5.rb b/lib/nokogiri/html5.rb index 5e994ce155..c76b4a6d4c 100644 --- a/lib/nokogiri/html5.rb +++ b/lib/nokogiri/html5.rb @@ -1,3 +1,4 @@ +# coding: utf-8 # frozen_string_literal: true # # Copyright 2013-2021 Sam Ruby, Stephen Checkoway @@ -22,11 +23,203 @@ module Nokogiri # @since v1.12.0 # @note HTML5 functionality is not available when running JRuby. 
- # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse - def self.HTML5(string_or_io, url = nil, encoding = nil, **options, &block) - Nokogiri::HTML5::Document.parse(string_or_io, url, encoding, **options, &block) + # Parse an HTML5 document. Convenience method for {Nokogiri::HTML5::Document.parse} + def self.HTML5(input, url = nil, encoding = nil, **options, &block) + Nokogiri::HTML5::Document.parse(input, url, encoding, **options, &block) end + # == Usage + # + # Parse an HTML5 document: + # + # doc = Nokogiri.HTML5(string) + # + # Parse an HTML5 fragment: + # + # fragment = Nokogiri::HTML5.fragment(string) + # + # == Parsing options + # + # The document and fragment parsing methods support options that are different from Nokogiri's. + # + # - Nokogiri.HTML5(html, url = nil, encoding = nil, options = {}) + # - Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {}) + # - Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {}) + # - Nokogiri::HTML5.fragment(html, encoding = nil, options = {}) + # - Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {}) + # + # The three currently supported options are +:max_errors+, +:max_tree_depth+ and + # +:max_attributes+, described below. + # + # === Error reporting + # + # Nokogumbo contains an experimental parse error reporting facility. By default, no parse errors + # are reported but this can be configured by passing the +:max_errors+ option to {HTML5.parse} or + # {HTML5.fragment}. + # + # For example, this script: + # + # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10) + # doc.errors.each do |err| + # puts(err) + # end + # + # Emits: + # + # 1:1: ERROR: Expected a doctype token + # <span/>Hi there!</span foo=bar /> + # ^ + # 1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'. + # <span/>Hi there!</span foo=bar /> + # ^ + # 1:17: ERROR: End tag ends with '/>', use '>'. + # <span/>Hi there!</span foo=bar /> + # ^ + # 1:17: ERROR: End tag contains attributes. + # <span/>Hi there!</span foo=bar />
 + # ^ + # + # Using max_errors: -1 results in an unlimited number of errors being returned. + # + # The errors returned by {HTML5::Document#errors} are instances of {Nokogiri::XML::SyntaxError}. + # + # The {https://html.spec.whatwg.org/multipage/parsing.html#parse-errors HTML standard} defines a + # number of standard parse error codes. These error codes only cover the "tokenization" stage of + # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error + # codes (yet). + # + # As a convenience to Nokogumbo users, the defined error codes are available via + # {Nokogiri::XML::SyntaxError#str1} method. + # + # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10) + # doc.errors.each do |err| + # puts("#{err.line}:#{err.column}: #{err.str1}") + # end + # # => 1:1: generic-parser + # # 1:1: non-void-html-element-start-tag-with-trailing-solidus + # # 1:17: end-tag-with-trailing-solidus + # # 1:17: end-tag-with-attributes + # + # Note that the first error is +generic-parser+ because it's an error from the tree construction + # stage and doesn't have a standardized error code. + # + # For the purposes of semantic versioning, the error messages, error locations, and error codes + # are not part of Nokogumbo's public API. That is, these are subject to change without Nokogumbo's + # major version number changing. These may be stabilized in the future. + # + # === Maximum tree depth + # + # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the + # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an + # {::ArgumentError} is thrown. + # + # This limit (which defaults to Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400) can be removed + # by giving the option max_tree_depth: -1. + # + # html = '<!DOCTYPE html>' + '<div>' * 1000 + # doc = Nokogiri.HTML5(html) + # # raises ArgumentError: Document tree depth limit exceeded + # doc = Nokogiri.HTML5(html, max_tree_depth: -1) + # + # === Attribute limit per element + # + # The maximum number of attributes per DOM element is configurable by the +:max_attributes+ + # option. If a given element would exceed this limit, then an {::ArgumentError} is thrown. + # + # This limit (which defaults to Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400) can be removed + # by giving the option max_attributes: -1. + # + # html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>' + # # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>" + # doc = Nokogiri.HTML5(html) + # # raises ArgumentError: Attributes per element limit exceeded + # doc = Nokogiri.HTML5(html, max_attributes: -1) + # + # == HTML Serialization + # + # After parsing HTML, it may be serialized using any of the {Nokogiri::XML::Node} serialization + # methods. In particular, {XML::Node#serialize}, {XML::Node#to_html}, and {XML::Node#to_s} will + # serialize a given node and its children. (This is the equivalent of JavaScript's + # +Element.outerHTML+.) Similarly, {XML::Node#inner_html} will serialize the children of a given + # node. (This is the equivalent of JavaScript's +Element.innerHTML+.) + # + # doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>") + # puts doc.serialize + # # => <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html> + # + # Due to quirks in how HTML is parsed and serialized, it's possible for a DOM tree to be + # serialized and then re-parsed, resulting in a different DOM. Mostly, this happens with DOMs + # produced from invalid HTML. Unfortunately, even valid HTML may not survive serialization and + # re-parsing. + # + # In particular, a newline at the start of +pre+, +listing+, and +textarea+ elements is ignored by + # the parser. + # + # doc = Nokogiri::HTML5(<<-EOF) + # <!DOCTYPE html> + # <pre> + # Content</pre> + # EOF + # puts doc.at('/html/body/pre').serialize + # # => <pre>Content</pre>
 + # + # In this case, the original HTML is semantically equivalent to the serialized version. If the + # +pre+, +listing+, or +textarea+ content starts with two newlines, the first newline will be + # stripped on the first parse and the second newline will be stripped on the second, leading to + # semantically different DOMs. Passing the parameter preserve_newline: true will cause + # two or more newlines to be preserved. (A single leading newline will still be removed.) + # + # doc = Nokogiri::HTML5(<<-EOF) + # <!DOCTYPE html> + # <listing> + # + # Content</listing> + # EOF + # puts doc.at('/html/body/listing').serialize(preserve_newline: true) + # # => <listing> + # # + # # Content</listing> + # + # == Encodings + # + # Nokogumbo always parses HTML using {https://en.wikipedia.org/wiki/UTF-8 UTF-8}; however, the + # encoding of the input can be explicitly selected via the optional +encoding+ parameter. This is + # most useful when the input comes not from a string but from an IO object. + # + # When serializing a document or node, the encoding of the output string can be specified via the + # +:encoding+ options. Characters that cannot be encoded in the selected encoding will be encoded + # as {https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references HTML numeric + # entities}. + # + # frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>') + # html = frag.serialize(encoding: 'US-ASCII') + # puts html + # # => <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span> + # frag = Nokogiri::HTML5.fragment(html) + # puts frag.serialize + # # => <span>아는 길도 물어가라</span> + # + # (There's a {https://bugs.ruby-lang.org/issues/15033 bug} in all current versions of Ruby that + # can cause the entity encoding to fail. Of the mandated supported encodings for HTML, the only + # encoding I'm aware of that has this bug is 'ISO-2022-JP'. We recommend avoiding this + # encoding.) + # + # == Notes + # + # * The {Nokogiri::HTML5.fragment} function takes a string and parses it + # as a HTML5 document. 
The +<html>+, +<head>+, and +<body>+ elements are + # removed from this document, and any children of these elements that remain + # are returned as a {Nokogiri::HTML5::DocumentFragment}. + # + # * The {Nokogiri::HTML5.parse} function takes a string and passes it to the + # gumbo_parse_with_options method, using the default options. + # The resulting Gumbo parse tree is then walked. + # + # * Instead of uppercase element names, lowercase element names are produced. + # + # * Instead of returning +unknown+ as the element name for unknown tags, the + # original tag name is returned verbatim. + # # @since v1.12.0 # @note HTML5 functionality is not available when running JRuby. module HTML5 @@ -38,13 +231,13 @@ module HTML5 XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze - # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse + # Parse an HTML 5 document. Convenience method for {Nokogiri::HTML5::Document.parse} def self.parse(string, url = nil, encoding = nil, **options, &block) Document.parse(string, url, encoding, **options, &block) end # Parse a fragment from +string+. Convenience method for - # Nokogiri::HTML5::DocumentFragment.parse. + # {Nokogiri::HTML5::DocumentFragment.parse}. def self.fragment(string, encoding = nil, **options) DocumentFragment.parse(string, encoding, options) end @@ -57,6 +250,14 @@ def self.fragment(string, encoding = nil, **options) # * :follow_limit => number of redirects which are followed # * :basic_auth => [username, password] def self.get(uri, options={}) + warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.", + uplevel: 1, category: :deprecated) + get_impl(uri, options) + end + + private + + def self.get_impl(uri, options={}) headers = options.clone headers = {:follow_limit => headers} if Numeric === headers # deprecated limit=headers[:follow_limit] ? 
headers.delete(:follow_limit).to_i : 10 @@ -99,14 +300,12 @@ def self.get(uri, options={}) when Net::HTTPRedirection response.value if limit <= 1 location = URI.join(uri, response['location']) - get(location, options.merge(:follow_limit => limit-1)) + get_impl(location, options.merge(:follow_limit => limit-1)) else response.value end end - private - def self.read_and_encode(string, encoding) # Read the string with the given encoding. if string.respond_to?(:read) @@ -131,15 +330,14 @@ def self.read_and_encode(string, encoding) string end - # Charset sniffing is a complex and controversial topic that understandably - # isn't done _by default_ by the Ruby Net::HTTP library. This being said, - # it is a very real problem for consumers of HTML as the default for HTML - # is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser - # *only* supports utf-8. + # Charset sniffing is a complex and controversial topic that understandably isn't done _by + # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for + # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and + # the Gumbo parser *only* supports utf-8. # - # Accordingly, Nokogiri::HTML::Document.parse provides limited encoding - # detection. Following this lead, Nokogiri::HTML5 attempts to do likewise, - # while attempting to more closely follow the HTML5 standard. + # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following + # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow + # the HTML5 standard. # # http://bugs.ruby-lang.org/issues/2567 # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding diff --git a/lib/nokogiri/html5/document.rb b/lib/nokogiri/html5/document.rb index 47bc078edc..fe74d99b75 100644 --- a/lib/nokogiri/html5/document.rb +++ b/lib/nokogiri/html5/document.rb @@ -15,11 +15,13 @@ # limitations under the License. 
# +require_relative "../html4/document" + module Nokogiri module HTML5 # @since v1.12.0 # @note HTML5 functionality is not available when running JRuby. - class Document < Nokogiri::HTML::Document + class Document < Nokogiri::HTML4::Document def self.parse(string_or_io, url = nil, encoding = nil, **options, &block) yield options if block_given? string_or_io = '' unless string_or_io diff --git a/lib/nokogiri/html5/document_fragment.rb b/lib/nokogiri/html5/document_fragment.rb index 88629b0ca5..66c6b413a9 100644 --- a/lib/nokogiri/html5/document_fragment.rb +++ b/lib/nokogiri/html5/document_fragment.rb @@ -15,13 +15,13 @@ # limitations under the License. # -require 'nokogiri/html/document_fragment' +require_relative "../html4/document_fragment" module Nokogiri module HTML5 # @since v1.12.0 # @note HTML5 functionality is not available when running JRuby. - class DocumentFragment < Nokogiri::HTML::DocumentFragment + class DocumentFragment < Nokogiri::HTML4::DocumentFragment attr_accessor :document attr_accessor :errors @@ -48,11 +48,11 @@ def serialize(options = {}, &block) def self.parse(tags, encoding = nil, options = {}) doc = HTML5::Document.new tags = HTML5.read_and_encode(tags, encoding) - doc.encoding = 'UTF-8' + doc.encoding = "UTF-8" new(doc, tags, nil, options) end - def extract_params params # :nodoc: + def extract_params(params) # :nodoc: handler = params.find do |param| ![Hash, String, Symbol].include?(param.class) end @@ -67,14 +67,13 @@ def extract_params params # :nodoc: ns ||= begin - ns = Hash.new + ns = {} children.each { |child| ns.merge!(child.namespaces) } ns end [params, handler, ns, binds] end - end end end diff --git a/lib/nokogiri/html5/node.rb b/lib/nokogiri/html5/node.rb index ea36163c1a..92cf1b9470 100644 --- a/lib/nokogiri/html5/node.rb +++ b/lib/nokogiri/html5/node.rb @@ -15,32 +15,13 @@ # limitations under the License. 
# -require 'nokogiri/xml/node' +require_relative "../xml/node" module Nokogiri module HTML5 # @since v1.12.0 # @note HTML5 functionality is not available when running JRuby. module Node - # HTML elements can have attributes that contain colons. - # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName - # and tries to create an attribute in a namespace. This is especially - # annoying with attribute names like xml:lang since libxml2 will - # actually create the xml namespace if it doesn't exist already. - def add_child_node_and_reparent_attrs(node) - return super(node) unless document.is_a?(HTML5::Document) - # I'm not sure what this method is supposed to do. Reparenting - # namespaces is handled by libxml2, including child namespaces which - # this method wouldn't handle. - # https://github.com/sparklemotion/nokogiri/issues/1790 - add_child_node(node) - #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr| - # attr.remove - # ns = attr.namespace - # a["#{ns.prefix}:#{attr.name}"] = attr.value - #end - end - def inner_html(options = {}) return super(options) unless document.is_a?(HTML5::Document) result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? String.new("\n") : String.new @@ -59,20 +40,20 @@ def write_to(io, *options) save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT indent_times = options[:indent] || 2 end - indent_string = (options[:indent_text] || ' ') * indent_times + indent_string = (options[:indent_text] || " ") * indent_times config = XML::Node::SaveOptions.new(save_options.to_i) yield config if block_given? config_options = config.options - if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0) + if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0 # Use Nokogiri's serializing code. native_write_to(io, encoding, indent_string, config_options) else # Serialize including the current node. 
encoding ||= document.encoding || Encoding::UTF_8 internal_ops = { - preserve_newline: options[:preserve_newline] || false + preserve_newline: options[:preserve_newline] || false, } HTML5.serialize_node_internal(self, io, encoding, internal_ops) end @@ -82,6 +63,27 @@ def fragment(tags) return super(tags) unless document.is_a?(HTML5::Document) DocumentFragment.new(document, tags, self) end + + private + + # HTML elements can have attributes that contain colons. + # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName + # and tries to create an attribute in a namespace. This is especially + # annoying with attribute names like xml:lang since libxml2 will + # actually create the xml namespace if it doesn't exist already. + def add_child_node_and_reparent_attrs(node) + return super(node) unless document.is_a?(HTML5::Document) + # I'm not sure what this method is supposed to do. Reparenting + # namespaces is handled by libxml2, including child namespaces which + # this method wouldn't handle. + # https://github.com/sparklemotion/nokogiri/issues/1790 + add_child_node(node) + # node.attribute_nodes.find_all { |a| a.namespace }.each do |attr| + # attr.remove + # ns = attr.namespace + # a["#{ns.prefix}:#{attr.name}"] = attr.value + # end + end end # Monkey patch XML::Node.prepend(HTML5::Node) diff --git a/lib/nokogiri/version/info.rb b/lib/nokogiri/version/info.rb index 309490defc..460408d186 100644 --- a/lib/nokogiri/version/info.rb +++ b/lib/nokogiri/version/info.rb @@ -206,9 +206,9 @@ def self.jruby? # :nodoc: # Ensure constants used in this file are loaded - see #1896 if Nokogiri.jruby? 
- require "nokogiri/jruby/dependencies" + require_relative "../jruby/dependencies" end - require "nokogiri/extension" + require_relative "../extension" # More complete version information about libxml VERSION_INFO = VersionInfo.instance.to_hash diff --git a/lib/nokogiri/xml.rb b/lib/nokogiri/xml.rb index 93a898b951..e1e21c05ea 100644 --- a/lib/nokogiri/xml.rb +++ b/lib/nokogiri/xml.rb @@ -1,38 +1,9 @@ # frozen_string_literal: true -require 'nokogiri/xml/pp' -require 'nokogiri/xml/parse_options' -require 'nokogiri/xml/sax' -require 'nokogiri/xml/searchable' -require 'nokogiri/xml/node' -require 'nokogiri/xml/attribute_decl' -require 'nokogiri/xml/element_decl' -require 'nokogiri/xml/element_content' -require 'nokogiri/xml/character_data' -require 'nokogiri/xml/namespace' -require 'nokogiri/xml/attr' -require 'nokogiri/xml/dtd' -require 'nokogiri/xml/cdata' -require 'nokogiri/xml/text' -require 'nokogiri/xml/document' -require 'nokogiri/xml/document_fragment' -require 'nokogiri/xml/processing_instruction' -require 'nokogiri/xml/node_set' -require 'nokogiri/xml/syntax_error' -require 'nokogiri/xml/xpath' -require 'nokogiri/xml/xpath_context' -require 'nokogiri/xml/builder' -require 'nokogiri/xml/reader' -require 'nokogiri/xml/notation' -require 'nokogiri/xml/entity_decl' -require 'nokogiri/xml/entity_reference' -require 'nokogiri/xml/schema' -require 'nokogiri/xml/relax_ng' - module Nokogiri class << self ### # Parse XML. 
Convenience method for Nokogiri::XML::Document.parse - def XML thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_XML, &block + def XML(thing, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_XML, &block) Nokogiri::XML::Document.parse(thing, url, encoding, options, &block) end end @@ -41,20 +12,19 @@ module XML # Original C14N 1.0 spec canonicalization XML_C14N_1_0 = 0 # Exclusive C14N 1.0 spec canonicalization - XML_C14N_EXCLUSIVE_1_0 = 1 + XML_C14N_EXCLUSIVE_1_0 = 1 # C14N 1.1 spec canonicalization XML_C14N_1_1 = 2 class << self ### # Parse an XML document using the Nokogiri::XML::Reader API. See # Nokogiri::XML::Reader for mor information - def Reader string_or_io, url = nil, encoding = nil, options = ParseOptions::STRICT - + def Reader(string_or_io, url = nil, encoding = nil, options = ParseOptions::STRICT) options = Nokogiri::XML::ParseOptions.new(options) if Integer === options # Give the options to the user yield options if block_given? - if string_or_io.respond_to? :read + if string_or_io.respond_to?(:read) return Reader.from_io(string_or_io, url, encoding, options.to_i) end Reader.from_memory(string_or_io, url, encoding, options.to_i) @@ -62,15 +32,44 @@ def Reader string_or_io, url = nil, encoding = nil, options = ParseOptions::STRI ### # Parse XML. Convenience method for Nokogiri::XML::Document.parse - def parse thing, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block + def parse(thing, url = nil, encoding = nil, options = ParseOptions::DEFAULT_XML, &block) Document.parse(thing, url, encoding, options, &block) end #### # Parse a fragment from +string+ in to a NodeSet. 
- def fragment string + def fragment(string) XML::DocumentFragment.parse(string) end end end end + +require_relative "xml/pp" +require_relative "xml/parse_options" +require_relative "xml/sax" +require_relative "xml/searchable" +require_relative "xml/node" +require_relative "xml/attribute_decl" +require_relative "xml/element_decl" +require_relative "xml/element_content" +require_relative "xml/character_data" +require_relative "xml/namespace" +require_relative "xml/attr" +require_relative "xml/dtd" +require_relative "xml/cdata" +require_relative "xml/text" +require_relative "xml/document" +require_relative "xml/document_fragment" +require_relative "xml/processing_instruction" +require_relative "xml/node_set" +require_relative "xml/syntax_error" +require_relative "xml/xpath" +require_relative "xml/xpath_context" +require_relative "xml/builder" +require_relative "xml/reader" +require_relative "xml/notation" +require_relative "xml/entity_decl" +require_relative "xml/entity_reference" +require_relative "xml/schema" +require_relative "xml/relax_ng" diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index 44f94ccd49..a06ba5fa56 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -1,7 +1,6 @@ # encoding: UTF-8 # frozen_string_literal: true require "stringio" -require "nokogiri/xml/node/save_options" module Nokogiri module XML @@ -837,7 +836,7 @@ def parse(string_or_io, options = nil) node_set = in_context(contents, options.to_i) if (node_set.empty? && (document.errors.length > error_count)) if options.recover? - fragment = Nokogiri::HTML::DocumentFragment.parse contents + fragment = Nokogiri::HTML4::DocumentFragment.parse contents node_set = fragment.children else raise document.errors[error_count] @@ -883,7 +882,7 @@ def xml? type == DOCUMENT_NODE end - # Returns true if this is an HTML::Document node + # Returns true if this is an HTML4::Document node def html? type == HTML_DOCUMENT_NODE end @@ -909,11 +908,11 @@ def fragment? 
end ### - # Fetch the Nokogiri::HTML::ElementDescription for this node. Returns + # Fetch the Nokogiri::HTML4::ElementDescription for this node. Returns # nil on XML documents and on unknown tags. def description return nil if document.xml? - Nokogiri::HTML::ElementDescription[name] + Nokogiri::HTML4::ElementDescription[name] end ### @@ -1235,3 +1234,5 @@ def add_child_node_and_reparent_attrs(node) end end end + +require_relative "node/save_options" diff --git a/lib/nokogiri/xml/pp.rb b/lib/nokogiri/xml/pp.rb index 1b0f4df850..591bc8fa79 100644 --- a/lib/nokogiri/xml/pp.rb +++ b/lib/nokogiri/xml/pp.rb @@ -1,3 +1,3 @@ # frozen_string_literal: true -require 'nokogiri/xml/pp/node' -require 'nokogiri/xml/pp/character_data' +require_relative "pp/node" +require_relative "pp/character_data" diff --git a/lib/nokogiri/xml/sax.rb b/lib/nokogiri/xml/sax.rb index e965f677b0..f4c35b7137 100644 --- a/lib/nokogiri/xml/sax.rb +++ b/lib/nokogiri/xml/sax.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true -require 'nokogiri/xml/sax/document' -require 'nokogiri/xml/sax/parser_context' -require 'nokogiri/xml/sax/parser' -require 'nokogiri/xml/sax/push_parser' +require_relative "sax/document" +require_relative "sax/parser_context" +require_relative "sax/parser" +require_relative "sax/push_parser" diff --git a/lib/nokogiri/xml/sax/document.rb b/lib/nokogiri/xml/sax/document.rb index 0f32e8930a..8336b74789 100644 --- a/lib/nokogiri/xml/sax/document.rb +++ b/lib/nokogiri/xml/sax/document.rb @@ -2,20 +2,19 @@ module Nokogiri module XML ### - # SAX Parsers are event driven parsers. Nokogiri provides two different - # event based parsers when dealing with XML. If you want to do SAX style - # parsing using HTML, check out Nokogiri::HTML::SAX. + # SAX Parsers are event driven parsers. Nokogiri provides two different event based parsers when + # dealing with XML. If you want to do SAX style parsing using HTML, check out + # Nokogiri::HTML4::SAX. 
# - # The basic way a SAX style parser works is by creating a parser, - # telling the parser about the events we're interested in, then giving - # the parser some XML to process. The parser will notify you when - # it encounters events you said you would like to know about. + # The basic way a SAX style parser works is by creating a parser, telling the parser about the + # events we're interested in, then giving the parser some XML to process. The parser will notify + # you when it encounters events you said you would like to know about. # - # To register for events, you simply subclass Nokogiri::XML::SAX::Document, - # and implement the methods for which you would like notification. + # To register for events, you simply subclass Nokogiri::XML::SAX::Document, and implement the + # methods for which you would like notification. # - # For example, if I want to be notified when a document ends, and when an - # element starts, I would write a class like this: + # For example, if I want to be notified when a document ends, and when an element starts, I + # would write a class like this: # # class MyDocument < Nokogiri::XML::SAX::Document # def end_document @@ -27,8 +26,7 @@ module XML # end # end # - # Then I would instantiate a SAX parser with this document, and feed the - # parser some XML + # Then I would instantiate a SAX parser with this document, and feed the parser some XML # # # Create a new parser # parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new) @@ -36,25 +34,21 @@ module XML # # Feed the parser some XML # parser.parse(File.open(ARGV[0])) # - # Now my document handler will be called when each node starts, and when - # then document ends. To see what kinds of events are available, take - # a look at Nokogiri::XML::SAX::Document. + # Now my document handler will be called when each node starts, and when then document ends. To + # see what kinds of events are available, take a look at Nokogiri::XML::SAX::Document. 
# - # Two SAX parsers for XML are available, a parser that reads from a string - # or IO object as it feels necessary, and a parser that lets you spoon - # feed it XML. If you want to let Nokogiri deal with reading your XML, - # use the Nokogiri::XML::SAX::Parser. If you want to have fine grain + # Two SAX parsers for XML are available, a parser that reads from a string or IO object as it + # feels necessary, and a parser that lets you spoon feed it XML. If you want to let Nokogiri + # deal with reading your XML, use the Nokogiri::XML::SAX::Parser. If you want to have fine grain # control over the XML input, use the Nokogiri::XML::SAX::PushParser. module SAX ### - # This class is used for registering types of events you are interested - # in handling. All of the methods on this class are available as - # possible events while parsing an XML document. To register for any - # particular event, just subclass this class and implement the methods - # you are interested in knowing about. + # This class is used for registering types of events you are interested in handling. All of + # the methods on this class are available as possible events while parsing an XML document. To + # register for any particular event, just subclass this class and implement the methods you + # are interested in knowing about. # - # To only be notified about start and end element events, write a class - # like this: + # To only be notified about start and end element events, write a class like this: # # class MyDocument < Nokogiri::XML::SAX::Document # def start_element name, attrs = [] @@ -66,8 +60,8 @@ module SAX # end # end # - # You can use this event handler for any SAX style parser included with - # Nokogiri. See Nokogiri::XML::SAX, and Nokogiri::HTML::SAX. + # You can use this event handler for any SAX style parser included with Nokogiri. See + # Nokogiri::XML::SAX, and Nokogiri::HTML4::SAX. 
class Document ### # Called when an XML declaration is parsed @@ -129,7 +123,7 @@ def end_element_namespace name, prefix = nil, uri = nil end ### - # Characters read between a tag. This method might be called multiple + # Characters read between a tag. This method might be called multiple # times given one contiguous string of characters. # # +string+ contains the character data diff --git a/lib/nokogiri/xml/xpath.rb b/lib/nokogiri/xml/xpath.rb index e2f30fe0e4..4caba97f8d 100644 --- a/lib/nokogiri/xml/xpath.rb +++ b/lib/nokogiri/xml/xpath.rb @@ -1,9 +1,9 @@ # frozen_string_literal: true -require 'nokogiri/xml/xpath/syntax_error' - module Nokogiri module XML module XPath end end end + +require_relative "xpath/syntax_error" diff --git a/lib/nokogiri/xslt.rb b/lib/nokogiri/xslt.rb index b6c9442008..87e87839c5 100644 --- a/lib/nokogiri/xslt.rb +++ b/lib/nokogiri/xslt.rb @@ -1,6 +1,4 @@ # frozen_string_literal: true -require "nokogiri/xslt/stylesheet" - module Nokogiri class << self ### @@ -55,3 +53,5 @@ def quote_params(params) end end end + +require_relative "xslt/stylesheet" diff --git a/nokogiri.gemspec b/nokogiri.gemspec index 87422c53a6..9a36de9f96 100644 --- a/nokogiri.gemspec +++ b/nokogiri.gemspec @@ -59,11 +59,11 @@ Gem::Specification.new do |spec| "bin/nokogiri", "dependencies.yml", "ext/java/nokogiri/EncodingHandler.java", - "ext/java/nokogiri/HtmlDocument.java", - "ext/java/nokogiri/HtmlElementDescription.java", - "ext/java/nokogiri/HtmlEntityLookup.java", - "ext/java/nokogiri/HtmlSaxParserContext.java", - "ext/java/nokogiri/HtmlSaxPushParser.java", + "ext/java/nokogiri/Html4Document.java", + "ext/java/nokogiri/Html4ElementDescription.java", + "ext/java/nokogiri/Html4EntityLookup.java", + "ext/java/nokogiri/Html4SaxParserContext.java", + "ext/java/nokogiri/Html4SaxPushParser.java", "ext/java/nokogiri/NokogiriService.java", "ext/java/nokogiri/XmlAttr.java", "ext/java/nokogiri/XmlAttributeDecl.java", @@ -147,11 +147,11 @@ Gem::Specification.new do |spec| 
"ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java", "ext/nokogiri/depend", "ext/nokogiri/extconf.rb", - "ext/nokogiri/html_document.c", - "ext/nokogiri/html_element_description.c", - "ext/nokogiri/html_entity_lookup.c", - "ext/nokogiri/html_sax_parser_context.c", - "ext/nokogiri/html_sax_push_parser.c", + "ext/nokogiri/html4_document.c", + "ext/nokogiri/html4_element_description.c", + "ext/nokogiri/html4_entity_lookup.c", + "ext/nokogiri/html4_sax_parser_context.c", + "ext/nokogiri/html4_sax_push_parser.c", "ext/nokogiri/libxml2_backwards_compat.c", "ext/nokogiri/nokogiri.c", "ext/nokogiri/nokogiri.h", @@ -244,15 +244,16 @@ Gem::Specification.new do |spec| "lib/nokogiri/extension.rb", "lib/nokogiri/gumbo.rb", "lib/nokogiri/html.rb", - "lib/nokogiri/html/builder.rb", - "lib/nokogiri/html/document.rb", - "lib/nokogiri/html/document_fragment.rb", - "lib/nokogiri/html/element_description.rb", - "lib/nokogiri/html/element_description_defaults.rb", - "lib/nokogiri/html/entity_lookup.rb", - "lib/nokogiri/html/sax/parser.rb", - "lib/nokogiri/html/sax/parser_context.rb", - "lib/nokogiri/html/sax/push_parser.rb", + "lib/nokogiri/html4.rb", + "lib/nokogiri/html4/builder.rb", + "lib/nokogiri/html4/document.rb", + "lib/nokogiri/html4/document_fragment.rb", + "lib/nokogiri/html4/element_description.rb", + "lib/nokogiri/html4/element_description_defaults.rb", + "lib/nokogiri/html4/entity_lookup.rb", + "lib/nokogiri/html4/sax/parser.rb", + "lib/nokogiri/html4/sax/parser_context.rb", + "lib/nokogiri/html4/sax/push_parser.rb", "lib/nokogiri/html5.rb", "lib/nokogiri/html5/document.rb", "lib/nokogiri/html5/document_fragment.rb", diff --git a/nokogumbo-import/README.md b/nokogumbo-import/README.md deleted file mode 100644 index 50d43cb00b..0000000000 --- a/nokogumbo-import/README.md +++ /dev/null @@ -1,304 +0,0 @@ -# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser. 
- -Nokogumbo provides the ability for a Ruby program to invoke -[our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src) -and to access the result as a -[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document). - -[![Github Actions Build Status](https://github.com/rubys/nokogumbo/actions/workflows/ci.yml/badge.svg)](https://github.com/rubys/nokogumbo/actions) -[![Appveyor Build Status](https://ci.appveyor.com/api/projects/status/github/rubys/nokogumbo)](https://ci.appveyor.com/project/rubys/nokogumbo/branch/master) - -## Usage - -```ruby -require 'nokogumbo' -doc = Nokogiri.HTML5(string) -``` - -To parse an HTML fragment, a `fragment` method is provided. - -```ruby -require 'nokogumbo' -doc = Nokogiri::HTML5.fragment(string) -``` - -Because HTML is often fetched via the web, a convenience interface to -HTTP get is also provided: - -```ruby -require 'nokogumbo' -doc = Nokogiri::HTML5.get(uri) -``` - -## Parsing options -The document and fragment parsing methods, -- `Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})` -- `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})` -- `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})` -- `Nokogiri::HTML5.fragment(html, encoding = nil, options = {})` -- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})` -support options that are different from Nokogiri's. - -The three currently supported options are `:max_errors`, `:max_tree_depth` and -`:max_attributes`, described below. - -### Error reporting -Nokogumbo contains an experimental parse error reporting facility. By default, -no parse errors are reported but this can be configured by passing the -`:max_errors` option to `::parse` or `::fragment`. 
- -```ruby -require 'nokogumbo' -doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10) -doc.errors.each do |err| - puts(err) -end -``` - -This prints the following. -``` -1:1: ERROR: Expected a doctype token -<span/>Hi there!</span foo=bar /> -^ -1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'. -<span/>Hi there!</span foo=bar /> -^ -1:17: ERROR: End tag ends with '/>', use '>'. -<span/>Hi there!</span foo=bar /> - ^ -1:17: ERROR: End tag contains attributes. -<span/>Hi there!</span foo=bar /> - ^ -``` - -Using `max_errors: -1` results in an unlimited number of errors being -returned. - -The errors returned by `#errors` are instances of -[`Nokogiri::XML::SyntaxError`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError). - -The [HTML -standard](https://html.spec.whatwg.org/multipage/parsing.html#parse-errors) -defines a number of standard parse error codes. These error codes only cover -the "tokenization" stage of parsing HTML. The parse errors in the -"tree construction" stage do not have standardized error codes (yet). - -As a convenience to Nokogumbo users, the defined error codes are available -via the -[`Nokogiri::XML::SyntaxError#str1`](https://www.rubydoc.info/github/sparklemotion/nokogiri/Nokogiri/XML/SyntaxError#str1-instance_method) -method. - -```ruby -require 'nokogumbo' -doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10) -doc.errors.each do |err| - puts("#{err.line}:#{err.column}: #{err.str1}") -end -``` - -This prints the following. -``` -1:1: generic-parser -1:1: non-void-html-element-start-tag-with-trailing-solidus -1:17: end-tag-with-trailing-solidus -1:17: end-tag-with-attributes -``` - -Note that the first error is `generic-parser` because it's an error from the -tree construction stage and doesn't have a standardized error code. - -For the purposes of semantic versioning, the error messages, error locations, -and error codes are not part of Nokogumbo's public API. That is, these are -subject to change without Nokogumbo's major version number changing. 
These may -be stabilized in the future. - -### Maximum tree depth -The maximum depth of the DOM tree parsed by the various parsing methods is -configurable by the `:max_tree_depth` option. If the depth of the tree would -exceed this limit, then an -[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown. - -This limit (which defaults to `Nokogumbo::DEFAULT_MAX_TREE_DEPTH = 400`) can -be removed by giving the option `max_tree_depth: -1`. - -``` ruby -html = '' + '
' * 1000 -doc = Nokogiri.HTML5(html) -# raises ArgumentError: Document tree depth limit exceeded -doc = Nokogiri.HTML5(html, max_tree_depth: -1) -``` - -### Attribute limit per element -The maximum number of attributes per DOM element is configurable by the -`:max_attributes` option. If a given element would exceed this limit, then an -[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown. - -This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can -be removed by giving the option `max_attributes: -1`. - -``` ruby -html = '
' -# "
" -doc = Nokogiri.HTML5(html) -# raises ArgumentError: Attributes per element limit exceeded -doc = Nokogiri.HTML5(html, max_attributes: -1) -``` - -## HTML Serialization - -After parsing HTML, it may be serialized using any of the Nokogiri -[serialization -methods](https://www.rubydoc.info/gems/nokogiri/Nokogiri/XML/Node). In -particular, `#serialize`, `#to_html`, and `#to_s` will serialize a given node -and its children. (This is the equivalent of JavaScript's -`Element.outerHTML`.) Similarly, `#inner_html` will serialize the children of -a given node. (This is the equivalent of JavaScript's `Element.innerHTML`.) - -``` ruby -doc = Nokogiri::HTML5("Hello world!") -puts doc.serialize -# Prints: Hello world! -``` - -Due to quirks in how HTML is parsed and serialized, it's possible for a DOM -tree to be serialized and then re-parsed, resulting in a different DOM. -Mostly, this happens with DOMs produced from invalid HTML. Unfortunately, even -valid HTML may not survive serialization and re-parsing. - -In particular, a newline at the start of `pre`, `listing`, and `textarea` -elements is ignored by the parser. - -``` ruby -doc = Nokogiri::HTML5(<<-EOF) - -
-Content
-EOF -puts doc.at('/html/body/pre').serialize -# Prints:
Content
-``` - -In this case, the original HTML is semantically equivalent to the serialized -version. If the `pre`, `listing`, or `textarea` content starts with two -newlines, the first newline will be stripped on the first parse and the second -newline will be stripped on the second, leading to semantically different -DOMs. Passing the parameter `preserve_newline: true` will cause two or more -newlines to be preserved. (A single leading newline will still be removed.) - -``` ruby -doc = Nokogiri::HTML5(<<-EOF) - -- -Content -EOF -puts doc.at('/html/body/listing').serialize(preserve_newline: true) -# Prints: -# -# Content -``` - -## Encodings -Nokogumbo always parses HTML using -[UTF-8](https://en.wikipedia.org/wiki/UTF-8); however, the encoding of the -input can be explicitly selected via the optional `encoding` parameter. This -is most useful when the input comes not from a string but from an IO object. - -When serializing a document or node, the encoding of the output string can be -specified via the `:encoding` options. Characters that cannot be encoded in -the selected encoding will be encoded as [HTML numeric -entities](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references). - -``` ruby -frag = Nokogiri::HTML5.fragment('아는 길도 물어가라') -html = frag.serialize(encoding: 'US-ASCII') -puts html -# Prints: &#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c; -frag = Nokogiri::HTML5.fragment(html) -puts frag.serialize -# Prints: 아는 길도 물어가라 -``` - -(There's a [bug](https://bugs.ruby-lang.org/issues/15033) in all current -versions of Ruby that can cause the entity encoding to fail. Of the mandated -supported encodings for HTML, the only encoding I'm aware of that has this bug -is `'ISO-2022-JP'`. I recommend avoiding this encoding.) - -## Examples -```ruby -require 'nokogumbo' -puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text -``` - -## Notes - -* The `Nokogiri::HTML5.fragment` function takes a string and parses it - as a HTML5 document. 
The `<html>`, `<head>`, and `<body>` elements are - removed from this document, and any children of these elements that remain - are returned as a `Nokogiri::HTML::DocumentFragment`. -* The `Nokogiri::HTML5.parse` function takes a string and passes it to the -gumbo_parse_with_options method, using the default options. -The resulting Gumbo parse tree is then walked. - * If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers - can be found at installation time then an - [xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced - and a single Nokogiri Ruby object is constructed to wrap the xmlDoc - structure. Nokogiri only produces Ruby objects as necessary, so all - searching is done using the underlying libxml2 libraries. - * If the necessary headers are not present at installation time, then - Nokogiri Ruby objects are created for each Gumbo node. Other than - memory usage and CPU time, the results should be equivalent. - -* The `Nokogiri::HTML5.get` function takes care of following redirects, -https, and determining the character encoding of the result, based on the -rules defined in the HTML5 specification for doing so. - -* Instead of uppercase element names, lowercase element names are produced. - -* Instead of returning `unknown` as the element name for unknown tags, the -original tag name is returned verbatim. - -# Flavors of Nokogumbo -Nokogumbo uses libxml2, the XML library underlying Nokogiri, to speed up -parsing. If the libxml2 headers are not available, then Nokogumbo resorts to -using Nokogiri's Ruby API to construct the DOM tree. - -Nokogiri can be configured to either use the system library version of libxml2 -or use a bundled version. By default (as of Nokogiri version 1.8.4), Nokogiri -will use a bundled version. - -To prevent differences between versions of libxml2, Nokogumbo will only use -libxml2 if the build process can find the exact same version used by Nokogiri. -This leads to three possibilities - -1. 
Nokogiri is compiled with the bundled libxml2. In this case, Nokogumbo will - (by default) use the same version of libxml2. -2. Nokogiri is compiled with the system libxml2. In this case, if the libxml2 - headers are available, then Nokogumbo will (by default) use the system - version and headers. -3. Nokogiri is compiled with the system libxml2 but its headers aren't - available at build time for Nokogumbo. In this case, Nokogumbo will use the - slower Ruby API. - -Using libxml2 can be required by passing `-- --with-libxml2` to `bundle exec -rake` or to `gem install`. Using libxml2 can be prohibited by instead passing -`-- --without-libxml2`. - -Functionally, the only difference between using libxml2 or not is in the -behavior of `Nokogiri::XML::Node#line`. If it is used, then `#line` will -return the line number of the corresponding node. Otherwise, it will return 0. - -# Installation - - git clone https://github.com/rubys/nokogumbo.git - cd nokogumbo - bundle install - rake gem - gem install pkg/nokogumbo*.gem - -# Related efforts - -* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) -- a ruby binding - for the Gumbo HTML5 parser. -* [lua-gumbo](https://gitlab.com/craigbarnes/lua-gumbo) -- a lua binding for - the Gumbo HTML5 parser. 
diff --git a/test/html/sax/test_parser.rb b/test/html4/sax/test_parser.rb similarity index 100% rename from test/html/sax/test_parser.rb rename to test/html4/sax/test_parser.rb diff --git a/test/html/sax/test_parser_context.rb b/test/html4/sax/test_parser_context.rb similarity index 100% rename from test/html/sax/test_parser_context.rb rename to test/html4/sax/test_parser_context.rb diff --git a/test/html/sax/test_parser_text.rb b/test/html4/sax/test_parser_text.rb similarity index 100% rename from test/html/sax/test_parser_text.rb rename to test/html4/sax/test_parser_text.rb diff --git a/test/html/sax/test_push_parser.rb b/test/html4/sax/test_push_parser.rb similarity index 100% rename from test/html/sax/test_push_parser.rb rename to test/html4/sax/test_push_parser.rb diff --git a/test/html/test_attributes.rb b/test/html4/test_attributes.rb similarity index 100% rename from test/html/test_attributes.rb rename to test/html4/test_attributes.rb diff --git a/test/html/test_attributes_properly_escaped.rb b/test/html4/test_attributes_properly_escaped.rb similarity index 100% rename from test/html/test_attributes_properly_escaped.rb rename to test/html4/test_attributes_properly_escaped.rb diff --git a/test/html/test_builder.rb b/test/html4/test_builder.rb similarity index 100% rename from test/html/test_builder.rb rename to test/html4/test_builder.rb diff --git a/test/html/test_comments.rb b/test/html4/test_comments.rb similarity index 100% rename from test/html/test_comments.rb rename to test/html4/test_comments.rb diff --git a/test/html/test_document.rb b/test/html4/test_document.rb similarity index 100% rename from test/html/test_document.rb rename to test/html4/test_document.rb diff --git a/test/html/test_document_encoding.rb b/test/html4/test_document_encoding.rb similarity index 100% rename from test/html/test_document_encoding.rb rename to test/html4/test_document_encoding.rb diff --git a/test/html/test_document_fragment.rb b/test/html4/test_document_fragment.rb 
similarity index 100% rename from test/html/test_document_fragment.rb rename to test/html4/test_document_fragment.rb diff --git a/test/html/test_element_description.rb b/test/html4/test_element_description.rb similarity index 100% rename from test/html/test_element_description.rb rename to test/html4/test_element_description.rb diff --git a/test/html4/test_html_module.rb b/test/html4/test_html_module.rb new file mode 100644 index 0000000000..6d142f016b --- /dev/null +++ b/test/html4/test_html_module.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true +require "helper" + +class Nokogiri::TestCase + describe Nokogiri::HTML do + it "is the same as Nokogiri::HTML4" do + assert_same(Nokogiri::HTML, Nokogiri::HTML4) + end + end + + describe "Nokogiri.HTML()" do + it "is the same as Nokogiri.HTML4()" do + assert_equal(Nokogiri.method(:HTML), Nokogiri.method(:HTML4)) + end + end +end diff --git a/test/html/test_named_characters.rb b/test/html4/test_named_characters.rb similarity index 100% rename from test/html/test_named_characters.rb rename to test/html4/test_named_characters.rb diff --git a/test/html/test_node.rb b/test/html4/test_node.rb similarity index 100% rename from test/html/test_node.rb rename to test/html4/test_node.rb diff --git a/test/html/test_node_encoding.rb b/test/html4/test_node_encoding.rb similarity index 100% rename from test/html/test_node_encoding.rb rename to test/html4/test_node_encoding.rb