Skip to content

Commit

Permalink
If the input file or URL is binary, throw an exception
Browse files Browse the repository at this point in the history
Prevents useless processing and apparent hangs

Fixes #1192
  • Loading branch information
jhy committed May 12, 2019
1 parent 7de614f commit 247c5d0
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ jsoup changelog
automatically set the mime boundary.
<https://github.com/jhy/jsoup/pull/1058>

* Improvement: Jsoup will now detect if an input file or URL is binary, and will refuse to attempt to parse it, with
an IO exception. This prevents runaway processing time and wasted effort creating meaningless parsed DOM trees.
<https://github.com/jhy/jsoup/issues/1192>

* Bugfix: when using the tag case preserving parsing settings, certain HTML tree building rules were not followed
for upper case tags.
<https://github.com/jhy/jsoup/issues/1149>
Expand Down
4 changes: 4 additions & 0 deletions src/main/java/org/jsoup/UncheckedIOException.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ public UncheckedIOException(IOException cause) {
super(cause);
}

/**
 * Creates an unchecked exception from a plain message. The message is wrapped in a new
 * {@link IOException}, which is handed to the superclass constructor as the cause.
 *
 * @param message detail message given to the wrapped IOException
 */
public UncheckedIOException(String message) {
super(new IOException(message));
}

/**
 * Returns the cause of this exception, cast to {@link IOException}.
 *
 * @return the result of {@code getCause()} as an IOException
 */
public IOException ioException() {
// unchecked cast: relies on the cause having been supplied as an IOException at construction
return (IOException) getCause();
}
Expand Down
6 changes: 5 additions & 1 deletion src/main/java/org/jsoup/helper/DataUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,11 @@ static Document parseInputStream(InputStream input, String charsetName, String b

if (charsetName == null) { // determine from meta. safe first parse as UTF-8
String docData = Charset.forName(defaultCharset).decode(firstBytes).toString();
doc = parser.parseInput(docData, baseUri);
try {
doc = parser.parseInput(docData, baseUri);
} catch (UncheckedIOException e) {
throw e.ioException();
}

// look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
Expand Down
21 changes: 21 additions & 0 deletions src/main/java/org/jsoup/parser/CharacterReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ public CharacterReader(Reader input, int sz) {
reader = input;
charBuf = new char[sz > maxBufferLen ? maxBufferLen : sz];
bufferUp();

if (isBinary()) {
throw new UncheckedIOException("Input is binary and unsupported");
}
}

public CharacterReader(Reader input) {
Expand Down Expand Up @@ -448,6 +452,23 @@ boolean containsIgnoreCase(String seq) {
return (nextIndexOf(loScan) > -1) || (nextIndexOf(hiScan) > -1);
}

private static final int numNullsConsideredBinary = 10; // conservative

/**
 * Heuristic check of whether the unread part of the buffer looks like binary content.
 * Counts the NUL characters in {@code charBuf} from {@code bufPos} (inclusive) to
 * {@code bufLength} (exclusive); the input is assumed to be decoded already, so many
 * NULs suggest a binary file rather than text.
 *
 * @return true if at least {@code numNullsConsideredBinary} NUL characters were counted
 */
boolean isBinary() {
    int nullCount = 0;
    int pos = bufPos;

    while (pos < bufLength) {
        if (charBuf[pos] == '\0') {
            nullCount += 1;
        }
        pos++;
    }

    final boolean looksBinary = nullCount >= numNullsConsideredBinary;
    return looksBinary;
}

@Override
public String toString() {
return new String(charBuf, bufPos, bufLength - bufPos);
Expand Down
64 changes: 63 additions & 1 deletion src/test/java/org/jsoup/integration/ConnectTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import org.jsoup.UncheckedIOException;
import org.jsoup.integration.servlets.Deflateservlet;
import org.jsoup.integration.servlets.EchoServlet;
import org.jsoup.integration.servlets.FileServlet;
import org.jsoup.integration.servlets.HelloServlet;
import org.jsoup.integration.servlets.InterruptedServlet;
import org.jsoup.integration.servlets.RedirectServlet;
Expand Down Expand Up @@ -418,7 +419,7 @@ public void handlesEmptyStreamDuringParseRead() throws IOException {
}

@Test
public void handlesEmtpyStreamDuringBufferdRead() throws IOException {
public void handlesEmtpyStreamDuringBufferedRead() throws IOException {
Connection.Response res = Jsoup.connect(InterruptedServlet.Url)
.timeout(200)
.execute();
Expand Down Expand Up @@ -477,4 +478,65 @@ public void handlesEmtpyStreamDuringBufferdRead() throws IOException {
assertEquals("POST", ihVal("Method", doc));
assertEquals("there", ihVal("Hello", doc));
}

/**
 * Requests the {@code /bomtests/bom_utf8.html} fixture through the file servlet and checks
 * that the response charset is reported as UTF-8 and the parsed document title is "OK".
 */
@Test
public void getUtf8Bom() throws IOException {
    final Connection connection = Jsoup.connect(FileServlet.Url);
    connection.data(FileServlet.LocationParam, "/bomtests/bom_utf8.html");

    final Document parsed = connection.get();

    assertEquals("UTF-8", connection.response().charset());
    assertEquals("OK", parsed.title());
}

/**
 * Requests a JPEG fixture served as {@code image/jpeg} with content-type checking disabled,
 * and expects the execute/parse sequence to fail with the "binary input" IOException.
 */
@Test
public void testBinaryThrowsExceptionWhenTypeIgnored() {
    final Connection connection = Jsoup.connect(FileServlet.Url);
    connection.data(FileServlet.LocationParam, "/htmltests/thumb.jpg");
    connection.data(FileServlet.ContentTypeParam, "image/jpeg");
    connection.ignoreContentType(true);

    IOException caught = null;
    try {
        connection.execute();
        connection.response().parse();
    } catch (IOException e) {
        caught = e;
    }

    // an exception must have been raised, and it must carry the binary-input message
    assertTrue(caught != null);
    assertEquals("Input is binary and unsupported", caught.getMessage());
}

/**
 * Requests a JPEG fixture that the servlet labels as {@code text/html}, and expects the
 * execute/parse sequence to fail with the "binary input" IOException.
 */
@Test
public void testBinaryResultThrows() {
    final Connection connection = Jsoup.connect(FileServlet.Url);
    connection.data(FileServlet.LocationParam, "/htmltests/thumb.jpg");
    connection.data(FileServlet.ContentTypeParam, "text/html");

    IOException caught = null;
    try {
        connection.execute();
        connection.response().parse();
    } catch (IOException e) {
        caught = e;
    }

    // an exception must have been raised, and it must carry the binary-input message
    assertTrue(caught != null);
    assertEquals("Input is binary and unsupported", caught.getMessage());
}

/**
 * Requests a JPEG fixture served as {@code image/jpeg} without disabling content-type
 * checking, and expects the execute/parse sequence to fail with the
 * "Unhandled content type" IOException.
 */
@Test
public void testBinaryContentTypeThrowsException() {
    final Connection connection = Jsoup.connect(FileServlet.Url);
    connection.data(FileServlet.LocationParam, "/htmltests/thumb.jpg");
    connection.data(FileServlet.ContentTypeParam, "image/jpeg");

    IOException caught = null;
    try {
        connection.execute();
        connection.response().parse();
    } catch (IOException e) {
        caught = e;
    }

    // an exception must have been raised, and it must carry the content-type message
    assertTrue(caught != null);
    assertEquals("Unhandled content type. Must be text/*, application/xml, or application/xhtml+xml", caught.getMessage());
}
}
14 changes: 10 additions & 4 deletions src/test/java/org/jsoup/integration/ParseTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,17 @@ public void testGoogleSearchIpod() throws IOException {
}

@Test
public void testBinary() throws IOException {
public void testBinaryThrowsException() throws IOException {
File in = getFile("/htmltests/thumb.jpg");
Document doc = Jsoup.parse(in, "UTF-8");
// nothing useful, but did not blow up
assertTrue(doc.text().contains("gd-jpeg"));

boolean threw = false;
try {
Document doc = Jsoup.parse(in, "UTF-8");
} catch (IOException e) {
threw = true;
assertEquals("Input is binary and unsupported", e.getMessage());
}
assertTrue(threw);
}

@Test
Expand Down
45 changes: 45 additions & 0 deletions src/test/java/org/jsoup/integration/servlets/FileServlet.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
package org.jsoup.integration.servlets;

import org.jsoup.integration.ParseTest;
import org.jsoup.integration.TestServer;

import javax.servlet.ServletException;
import javax.servlet.ServletOutputStream;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

/**
 * Test servlet that streams a file back to the client. The file is looked up with
 * {@code ParseTest.getFile} from the {@code loc} request parameter, and the response
 * content type is taken from the {@code contentType} parameter, falling back to
 * {@code text/html} when that parameter is absent.
 */
public class FileServlet extends BaseServlet {
    public static final String Url = TestServer.map(FileServlet.class);
    public static final String ContentTypeParam = "contentType";
    public static final String LocationParam = "loc";
    public static final String DefaultType = "text/html";

    /**
     * Writes the requested file to the response with status 200, or responds with
     * status 404 and no body when the resolved file does not exist.
     */
    @Override
    protected void doGet(HttpServletRequest req, HttpServletResponse res) throws IOException {
        final String requestedType = req.getParameter(ContentTypeParam);
        final String contentType = requestedType != null ? requestedType : DefaultType;
        final File file = ParseTest.getFile(req.getParameter(LocationParam));

        if (!file.exists()) {
            res.setStatus(HttpServletResponse.SC_NOT_FOUND);
            return;
        }

        res.setContentType(contentType);
        res.setStatus(HttpServletResponse.SC_OK);

        // stream the raw bytes unchanged so binary fixtures are served as-is
        final ServletOutputStream body = res.getOutputStream();
        Files.copy(file.toPath(), body);
        body.flush();
    }

    /** Handles POST requests exactly like GET requests. */
    @Override
    protected void doPost(HttpServletRequest req, HttpServletResponse res) throws ServletException, IOException {
        doGet(req, res);
    }
}

0 comments on commit 247c5d0

Please sign in to comment.