From a1fc59bb0b158b279615378c14ec5886647ad006 Mon Sep 17 00:00:00 2001 From: Anton Konovalov Date: Mon, 26 Apr 2021 17:21:18 +0300 Subject: [PATCH] Make possible parsing and cleaning entire html document --- src/main/java/org/jsoup/Jsoup.java | 32 ++++++++++++------- src/main/java/org/jsoup/parser/Parser.java | 29 ++++++++++++----- src/main/java/org/jsoup/safety/Cleaner.java | 22 ++++++++++--- src/main/java/org/jsoup/safety/Safelist.java | 16 +++++++--- .../java/org/jsoup/safety/CleanerTest.java | 19 ++++++++--- 5 files changed, 86 insertions(+), 32 deletions(-) diff --git a/src/main/java/org/jsoup/Jsoup.java b/src/main/java/org/jsoup/Jsoup.java index a8f1f44726..e70b3dc0eb 100644 --- a/src/main/java/org/jsoup/Jsoup.java +++ b/src/main/java/org/jsoup/Jsoup.java @@ -1,5 +1,12 @@ package org.jsoup; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.URL; + +import javax.annotation.Nullable; + import org.jsoup.helper.DataUtil; import org.jsoup.helper.HttpConnection; import org.jsoup.nodes.Document; @@ -8,12 +15,6 @@ import org.jsoup.safety.Safelist; import org.jsoup.safety.Whitelist; -import javax.annotation.Nullable; -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; - /** The core public access point to the jsoup functionality. @@ -215,6 +216,14 @@ public static Document parse(URL url, int timeoutMillis) throws IOException { return con.get(); } + /** + * {@code fullHtml} defaults to false + * @see Jsoup#clean(String, String, Safelist, boolean) + */ + public static String clean(String bodyHtml, String baseUri, Safelist safelist) { + return clean(bodyHtml, baseUri, safelist, false); + } + /** Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe tags and attributes. 
@@ -222,15 +231,16 @@ public static Document parse(URL url, int timeoutMillis) throws IOException { @param bodyHtml input untrusted HTML (body fragment) @param baseUri URL to resolve relative URLs against @param safelist list of permitted HTML elements - @return safe HTML (body fragment) + @param fullHtml true if the input is a full HTML document rather than a body fragment + @return safe HTML @see Cleaner#clean(Document) */ - public static String clean(String bodyHtml, String baseUri, Safelist safelist) { - Document dirty = parseBodyFragment(bodyHtml, baseUri); + public static String clean(String bodyHtml, String baseUri, Safelist safelist, boolean fullHtml) { + Document dirty = Parser.parseBodyFragment(bodyHtml, baseUri, fullHtml); Cleaner cleaner = new Cleaner(safelist); - Document clean = cleaner.clean(dirty); - return clean.body().html(); + Document clean = cleaner.clean(dirty, fullHtml); + return fullHtml ? clean.html() : clean.body().html(); } /** diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java index 8ff9667e44..6285bf6fb8 100644 --- a/src/main/java/org/jsoup/parser/Parser.java +++ b/src/main/java/org/jsoup/parser/Parser.java @@ -1,13 +1,13 @@ package org.jsoup.parser; -import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; -import org.jsoup.nodes.Node; - import java.io.Reader; import java.io.StringReader; import java.util.List; +import org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; + /** * Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods * in {@link org.jsoup.Jsoup}. 
@@ -174,24 +174,37 @@ public static List parseXmlFragment(String fragmentXml, String baseUri) { return treeBuilder.parseFragment(fragmentXml, baseUri, new Parser(treeBuilder)); } + /** + * {@code fullHtml} defaults to false + * @see Parser#parseBodyFragment(String, String, boolean) + */ + public static Document parseBodyFragment(String bodyHtml, String baseUri) { + return parseBodyFragment(bodyHtml, baseUri, false); + } + /** * Parse a fragment of HTML into the {@code body} of a Document. * * @param bodyHtml fragment of HTML * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs. + * @param fullHtml true if the input is a full HTML document rather than a body fragment * * @return Document, with empty head, and HTML parsed into body */ - public static Document parseBodyFragment(String bodyHtml, String baseUri) { + public static Document parseBodyFragment(String bodyHtml, String baseUri, boolean fullHtml) { Document doc = Document.createShell(baseUri); - Element body = doc.body(); - List nodeList = parseFragment(bodyHtml, body, baseUri); + Element root = fullHtml ? 
doc.body().parent() : doc.body(); + if (fullHtml) { + assert root != null; + root.empty(); + } + List nodeList = parseFragment(bodyHtml, root, baseUri); Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented for (int i = nodes.length - 1; i > 0; i--) { nodes[i].remove(); } for (Node node : nodes) { - body.appendChild(node); + root.appendChild(node); } return doc; } diff --git a/src/main/java/org/jsoup/safety/Cleaner.java b/src/main/java/org/jsoup/safety/Cleaner.java index bbe44ce458..857da8362c 100644 --- a/src/main/java/org/jsoup/safety/Cleaner.java +++ b/src/main/java/org/jsoup/safety/Cleaner.java @@ -1,5 +1,7 @@ package org.jsoup.safety; +import java.util.List; + import org.jsoup.helper.Validate; import org.jsoup.nodes.Attribute; import org.jsoup.nodes.Attributes; @@ -14,8 +16,6 @@ import org.jsoup.select.NodeTraversor; import org.jsoup.select.NodeVisitor; -import java.util.List; - /** The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes @@ -54,18 +54,32 @@ public Cleaner(Whitelist whitelist) { this.safelist = whitelist; } + /** + * {@code fullHtml} defaults to false + * @see Cleaner#clean(Document, boolean) + */ + public Document clean(Document dirtyDocument) { + return clean(dirtyDocument, false); + } + /** Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist. The original document is not modified. Only elements from the dirty document's body are used. The OutputSettings of the original document are cloned into the clean document. @param dirtyDocument Untrusted base document to clean. + @param fullHtml true to clean the whole document rather than only its body. @return cleaned document. 
*/ - public Document clean(Document dirtyDocument) { + public Document clean(Document dirtyDocument, boolean fullHtml) { Validate.notNull(dirtyDocument); Document clean = Document.createShell(dirtyDocument.baseUri()); - copySafeNodes(dirtyDocument.body(), clean.body()); + if (fullHtml) { + clean.empty(); + copySafeNodes(dirtyDocument, clean); + } else { + copySafeNodes(dirtyDocument.body(), clean.body()); + } clean.outputSettings(dirtyDocument.outputSettings().clone()); return clean; diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java index 76d56d2be2..854ecf4cf9 100644 --- a/src/main/java/org/jsoup/safety/Safelist.java +++ b/src/main/java/org/jsoup/safety/Safelist.java @@ -5,17 +5,17 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/ this safe-list configuration, and the initial defaults. */ -import org.jsoup.helper.Validate; -import org.jsoup.nodes.Attribute; -import org.jsoup.nodes.Attributes; -import org.jsoup.nodes.Element; +import static org.jsoup.internal.Normalizer.lowerCase; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; -import static org.jsoup.internal.Normalizer.lowerCase; +import org.jsoup.helper.Validate; +import org.jsoup.nodes.Attribute; +import org.jsoup.nodes.Attributes; +import org.jsoup.nodes.Element; /** @@ -180,6 +180,12 @@ public static Safelist relaxed() { ; } + public static Safelist full() { + return Safelist.relaxed() + .addTags("html", "body", "head", "meta", "style", "title") + .addAttributes("meta", "charset"); + } + /** Create a new, empty safelist. Generally it will be better to start with a default prepared safelist instead. 
diff --git a/src/test/java/org/jsoup/safety/CleanerTest.java b/src/test/java/org/jsoup/safety/CleanerTest.java index 33380549c6..84e6d2e7d5 100644 --- a/src/test/java/org/jsoup/safety/CleanerTest.java +++ b/src/test/java/org/jsoup/safety/CleanerTest.java @@ -1,5 +1,14 @@ package org.jsoup.safety; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Locale; + import org.jsoup.Jsoup; import org.jsoup.MultiLocaleExtension.MultiLocaleTest; import org.jsoup.TextUtil; @@ -7,10 +16,6 @@ import org.jsoup.nodes.Entities; import org.junit.jupiter.api.Test; -import java.util.Locale; - -import static org.junit.jupiter.api.Assertions.*; - /** Tests for the cleaner. @@ -50,6 +55,12 @@ public class CleanerTest { assertEquals("

Head

OneTwo
", TextUtil.stripNewlines(cleanHtml)); } + @Test public void testFull() { + String h = "title1"; + String cleanHtml = Jsoup.clean(h, "", Safelist.full(), true); + assertEquals("title1", TextUtil.stripNewlines(cleanHtml)); + } + @Test public void testRemoveTags() { String h = "

Nice

Hello
"; String cleanHtml = Jsoup.clean(h, Safelist.basic().removeTags("a"));