Skip to content

Commit

Permalink
Merge pull request #554 from sebastian-nagel/NUTCH-2582-set-mime-type…
Browse files Browse the repository at this point in the history
…s-reader-pool-size

NUTCH-2582 Set pool size of XML SAX parsers used for MIME detection in Tika
  • Loading branch information
sebastian-nagel authored Nov 18, 2020
2 parents 235af3c + 975452f commit c1cf6bb
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 13 deletions.
7 changes: 7 additions & 0 deletions conf/tika-config.xml.template
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,11 @@
-->
<properties>
<service-loader initializableProblemHandler="ignore" loadErrorHandler="warn" />
<!--
Set the pool size of SAX parsers to a higher value if the fetcher is
parsing with many threads and Tika complains with "Consider
increasing the XMLReaderUtils.POOL_SIZE". Tika's default pool
size is 10. Cf. NUTCH-2578, TIKA-2645, NUTCH-2582.
-->
<xml-reader-utils poolSize="20" />
</properties>
4 changes: 4 additions & 0 deletions src/java/org/apache/nutch/fetcher/Fetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
Expand Down Expand Up @@ -201,6 +202,9 @@ public void run(Context innerContext) throws IOException {
int threadCount = conf.getInt("fetcher.threads.fetch", 10);
LOG.info("Fetcher: threads: {}", threadCount);

// NUTCH-2582: adapt Tika MIME detector pool size to thread count
MimeUtil.setPoolSize(Math.max(10, threadCount / 2));

int timeoutDivisor = conf.getInt("fetcher.threads.timeout.divisor", 2);
LOG.info("Fetcher: time-out divisor: {}", timeoutDivisor);

Expand Down
28 changes: 15 additions & 13 deletions src/java/org/apache/nutch/util/MimeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,30 +22,24 @@
import java.lang.invoke.MethodHandles;

import org.apache.hadoop.conf.Configuration;

import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeType;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;

import org.apache.tika.mime.MimeTypesReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.nutch.protocol.ProtocolOutput;

/**
* @author mattmann
* @since NUTCH-608
*
* <p>
* This is a facade class to insulate Nutch from its underlying Mime Type
* substrate library, <a href="http://incubator.apache.org/tika/">Apache
* Tika</a>. Any mime handling code should be placed in this utility
* class, and hidden from the Nutch classes that rely on it.
* </p>
* This is a facade class to insulate Nutch from its underlying Mime Type
* substrate library, <a href="https://tika.apache.org/">Apache Tika</a>. Any
* Mime handling code should be placed in this utility class, and hidden from
* the Nutch classes that rely on it.
*/
public final class MimeUtil {

Expand All @@ -64,6 +58,14 @@ public final class MimeUtil {
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());

/**
 * Sets the pool size of the XML SAX parsers used by Tika for MIME detection
 * by delegating to {@link MimeTypesReader#setPoolSize(int)}.
 * <p>
 * If Tika rejects the request with a {@link TikaException}, the failure is
 * logged as an error and not propagated to the caller.
 * </p>
 *
 * @param poolSize
 *          requested size of the SAX parser pool
 */
public static void setPoolSize(int poolSize) {
  try {
    MimeTypesReader.setPoolSize(poolSize);
  } catch (TikaException e) {
    // Include the requested value so the failure can be diagnosed from the
    // log alone; the trailing throwable is logged with its stack trace.
    LOG.error("Failed to set pool size to {}", poolSize, e);
  }
}

public MimeUtil(Configuration conf) {
ObjectCache objectCache = ObjectCache.get(conf);
tika = (Tika) objectCache.getObject(Tika.class.getName());
Expand Down

0 comments on commit c1cf6bb

Please sign in to comment.