Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make possible parsing and cleaning entire html document #1530

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 21 additions & 11 deletions src/main/java/org/jsoup/Jsoup.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,12 @@
package org.jsoup;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

import javax.annotation.Nullable;

import org.jsoup.helper.DataUtil;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
Expand All @@ -8,12 +15,6 @@
import org.jsoup.safety.Safelist;
import org.jsoup.safety.Whitelist;

import javax.annotation.Nullable;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;

/**
The core public access point to the jsoup functionality.

Expand Down Expand Up @@ -215,22 +216,31 @@ public static Document parse(URL url, int timeoutMillis) throws IOException {
return con.get();
}

/**
 Get safe HTML from untrusted input HTML, treating the input as a body fragment. This is a convenience overload
 that forwards to {@link #clean(String, String, Safelist, boolean)} with {@code fullHtml} set to {@code false}.

 @param bodyHtml input untrusted HTML (body fragment)
 @param baseUri URL to resolve relative URLs against
 @param safelist list of permitted HTML elements
 @return safe HTML (body fragment)
 @see Jsoup#clean(String, String, Safelist, boolean)
 */
public static String clean(String bodyHtml, String baseUri, Safelist safelist) {
    // Fragment mode: the four-argument overload returns only the cleaned body's inner HTML.
    final boolean fullHtml = false;
    return clean(bodyHtml, baseUri, safelist, fullHtml);
}

/**
Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe
tags and attributes.

@param bodyHtml input untrusted HTML (body fragment)
@param baseUri URL to resolve relative URLs against
@param safelist list of permitted HTML elements
@return safe HTML (body fragment)
@param fullHtml if {@code true}, the input is parsed and cleaned in full-document mode and the whole cleaned
document is serialized; if {@code false}, only the inner HTML of the cleaned {@code body} is returned
@return safe HTML

@see Cleaner#clean(Document)
*/
// NOTE(review): old and new diff lines appear interleaved below without +/- markers (two signatures, one
// closing brace) - confirm against the raw diff which lines are the removed side.
public static String clean(String bodyHtml, String baseUri, Safelist safelist) {
Document dirty = parseBodyFragment(bodyHtml, baseUri);
public static String clean(String bodyHtml, String baseUri, Safelist safelist, boolean fullHtml) {
// The same fullHtml flag is forwarded to the parser and to the cleaner so both operate on the same scope.
Document dirty = Parser.parseBodyFragment(bodyHtml, baseUri, fullHtml);
Cleaner cleaner = new Cleaner(safelist);
Document clean = cleaner.clean(dirty);
return clean.body().html();
Document clean = cleaner.clean(dirty, fullHtml);
// Full mode serializes the whole cleaned document; fragment mode serializes only the body's inner HTML.
return fullHtml ? clean.html() : clean.body().html();
}

/**
Expand Down
29 changes: 21 additions & 8 deletions src/main/java/org/jsoup/parser/Parser.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package org.jsoup.parser;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;

/**
* Parses HTML into a {@link org.jsoup.nodes.Document}. Generally best to use one of the more convenient parse methods
* in {@link org.jsoup.Jsoup}.
Expand Down Expand Up @@ -174,24 +174,37 @@ public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
return treeBuilder.parseFragment(fragmentXml, baseUri, new Parser(treeBuilder));
}

/**
 * Parse a fragment of HTML into the {@code body} of a Document. Convenience overload that forwards to
 * {@link #parseBodyFragment(String, String, boolean)} with {@code fullHtml} set to {@code false}.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return the Document produced by the three-argument overload in fragment mode
 * @see Parser#parseBodyFragment(String, String, boolean)
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    // Fragment mode: the three-argument overload parses into, and appends to, the shell document's body.
    final boolean fullHtml = false;
    return parseBodyFragment(bodyHtml, baseUri, fullHtml);
}

/**
* Parse a fragment of HTML into the {@code body} of a Document.
*
* @param bodyHtml fragment of HTML
* @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
* @param fullHtml if {@code true}, the parent of the shell's {@code body} is emptied and used as both the parse
* context and the append target; if {@code false}, the {@code body} itself is used
*
* @return Document, with empty head, and HTML parsed into body
*/
// NOTE(review): old and new diff lines appear interleaved below without +/- markers (two signatures, two
// nodeList declarations, two appendChild calls) - confirm against the raw diff which lines are the removed side.
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
public static Document parseBodyFragment(String bodyHtml, String baseUri, boolean fullHtml) {
Document doc = Document.createShell(baseUri);
Element body = doc.body();
List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
// Pick the element that serves as parse context and as the parent for the parsed nodes.
Element root = fullHtml ? doc.body().parent() : doc.body();
if (fullHtml) {
// NOTE(review): this assert is only evaluated when the JVM runs with assertions enabled (-ea).
assert root != null;
// Clear the chosen root before appending the parsed nodes; presumably this drops the shell's
// pre-built children - TODO confirm what Document.createShell puts under this element.
root.empty();
}
List<Node> nodeList = parseFragment(bodyHtml, root, baseUri);
Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
// Walks from the last node down to index 1, so nodes[0] is not explicitly removed here.
for (int i = nodes.length - 1; i > 0; i--) {
nodes[i].remove();
}
// Append every parsed node, including nodes[0], to its target parent in original order.
for (Node node : nodes) {
body.appendChild(node);
root.appendChild(node);
}
return doc;
}
Expand Down
22 changes: 18 additions & 4 deletions src/main/java/org/jsoup/safety/Cleaner.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package org.jsoup.safety;

import java.util.List;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
Expand All @@ -14,8 +16,6 @@
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import java.util.List;


/**
The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
Expand Down Expand Up @@ -54,18 +54,32 @@ public Cleaner(Whitelist whitelist) {
this.safelist = whitelist;
}

/**
 Creates a new, clean document from the dirty document, containing only elements allowed by the safelist and
 taken from the dirty document's {@code body}. Equivalent to calling {@link #clean(Document, boolean)} with
 {@code fullHtml} set to {@code false}.

 @param dirtyDocument Untrusted base document to clean.
 @return cleaned document.
 @see Cleaner#clean(Document, boolean)
 */
public Document clean(Document dirtyDocument) {
    // Body-only mode; null validation of dirtyDocument happens in the two-argument overload.
    return clean(dirtyDocument, false);
}

/**
Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist.
The original document is not modified. Only elements from the dirty document's <code>body</code> are used. The
OutputSettings of the original document are cloned into the clean document.
@param dirtyDocument Untrusted base document to clean.
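@param fullHtml if {@code true}, the clean shell is emptied and safe nodes are copied from the whole dirty document; if {@code false}, only from its {@code body}.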
@return cleaned document.
*/
public Document clean(Document dirtyDocument) {
public Document clean(Document dirtyDocument, boolean fullHtml) {
Validate.notNull(dirtyDocument);

Document clean = Document.createShell(dirtyDocument.baseUri());
copySafeNodes(dirtyDocument.body(), clean.body());
if (fullHtml) {
clean.empty();
copySafeNodes(dirtyDocument, clean);
} else {
copySafeNodes(dirtyDocument.body(), clean.body());
}
clean.outputSettings(dirtyDocument.outputSettings().clone());

return clean;
Expand Down
16 changes: 11 additions & 5 deletions src/main/java/org/jsoup/safety/Safelist.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,17 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
this safe-list configuration, and the initial defaults.
*/

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;
import static org.jsoup.internal.Normalizer.lowerCase;

import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

import static org.jsoup.internal.Normalizer.lowerCase;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Element;


/**
Expand Down Expand Up @@ -180,6 +180,12 @@ public static Safelist relaxed() {
;
}

/**
 Builds a safelist for cleaning whole documents: starts from {@link #relaxed()} and additionally permits the
 document-structure tags {@code html}, {@code body}, {@code head}, {@code meta}, {@code style} and {@code title},
 plus the {@code charset} attribute on {@code meta}.
 <p>
 NOTE(review): permitting {@code style} lets CSS text through the cleaner - confirm that is acceptable for
 untrusted input.

 @return the safelist built from {@code relaxed()} with the additions above
 */
public static Safelist full() {
    Safelist base = Safelist.relaxed();
    Safelist withDocumentTags = base.addTags("html", "body", "head", "meta", "style", "title");
    return withDocumentTags.addAttributes("meta", "charset");
}

/**
Create a new, empty safelist. Generally it will be better to start with a default prepared safelist instead.

Expand Down
19 changes: 15 additions & 4 deletions src/test/java/org/jsoup/safety/CleanerTest.java
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
package org.jsoup.safety;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNotSame;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.util.Locale;

import org.jsoup.Jsoup;
import org.jsoup.MultiLocaleExtension.MultiLocaleTest;
import org.jsoup.TextUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;
import org.junit.jupiter.api.Test;

import java.util.Locale;

import static org.junit.jupiter.api.Assertions.*;

/**
Tests for the cleaner.

Expand Down Expand Up @@ -50,6 +55,12 @@ public class CleanerTest {
assertEquals("<h1>Head</h1><table><tbody><tr><td>One</td><td>Two</td></tr></tbody></table>", TextUtil.stripNewlines(cleanHtml));
}

// Cleaning in full-document mode with Safelist.full() should keep html/head/title/meta/style/body and drop script.
@Test public void testFull() {
    String dirty = "<html><head><title>title</title><meta charset=\"UTF-8\"><script></script><style>.clss{color:red;}</style></head><body>1<script></script></body></html>";
    String expected = "<html><head><title>title</title><meta charset=\"UTF-8\"><style>.clss{color:red;}</style></head><body>1</body></html>";

    String actual = TextUtil.stripNewlines(Jsoup.clean(dirty, "", Safelist.full(), true));

    assertEquals(expected, actual);
}

@Test public void testRemoveTags() {
String h = "<div><p><A HREF='HTTP://nice.com'>Nice</a></p><blockquote>Hello</blockquote>";
String cleanHtml = Jsoup.clean(h, Safelist.basic().removeTags("a"));
Expand Down