From 975452f7ac0b60f04d79b10477a4744dfe1aa673 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 16 Oct 2020 23:10:03 +0200 Subject: [PATCH] NUTCH-2582 Set pool size of XML SAX parsers used for MIME detection in Tika - add method in MimeUtil to set MimeTypesReader pool size - actually adjust pool size to number of Fetcher threads / 2 (minimum pool size is 10 in case there are fewer than 20 Fetcher threads) - double pool size (10 -> 20) of Tika XMLReaderUtils in tika-config.xml --- conf/tika-config.xml.template | 7 +++++ .../org/apache/nutch/fetcher/Fetcher.java | 4 +++ src/java/org/apache/nutch/util/MimeUtil.java | 28 ++++++++++--------- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/conf/tika-config.xml.template b/conf/tika-config.xml.template index 571a60632c..35f635e4e6 100644 --- a/conf/tika-config.xml.template +++ b/conf/tika-config.xml.template @@ -17,4 +17,11 @@ --> + + diff --git a/src/java/org/apache/nutch/fetcher/Fetcher.java b/src/java/org/apache/nutch/fetcher/Fetcher.java index 687411eceb..6d4c195a7b 100644 --- a/src/java/org/apache/nutch/fetcher/Fetcher.java +++ b/src/java/org/apache/nutch/fetcher/Fetcher.java @@ -46,6 +46,7 @@ import org.apache.nutch.crawl.CrawlDatum; import org.apache.nutch.crawl.NutchWritable; import org.apache.nutch.metadata.Nutch; +import org.apache.nutch.util.MimeUtil; import org.apache.nutch.util.NutchConfiguration; import org.apache.nutch.util.NutchJob; import org.apache.nutch.util.NutchTool; @@ -201,6 +202,9 @@ public void run(Context innerContext) throws IOException { int threadCount = conf.getInt("fetcher.threads.fetch", 10); LOG.info("Fetcher: threads: {}", threadCount); + // NUTCH-2582: adapt Tika MIME detector pool size to thread count + MimeUtil.setPoolSize(Math.max(10, threadCount / 2)); + int timeoutDivisor = conf.getInt("fetcher.threads.timeout.divisor", 2); LOG.info("Fetcher: time-out divisor: {}", timeoutDivisor); diff --git a/src/java/org/apache/nutch/util/MimeUtil.java 
b/src/java/org/apache/nutch/util/MimeUtil.java index 17bb380b7f..2cc0d142cf 100644 --- a/src/java/org/apache/nutch/util/MimeUtil.java +++ b/src/java/org/apache/nutch/util/MimeUtil.java @@ -22,30 +22,24 @@ import java.lang.invoke.MethodHandles; import org.apache.hadoop.conf.Configuration; - +import org.apache.nutch.protocol.ProtocolOutput; import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; import org.apache.tika.mime.MimeTypes; import org.apache.tika.mime.MimeTypesFactory; - +import org.apache.tika.mime.MimeTypesReader; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.nutch.protocol.ProtocolOutput; - /** - * @author mattmann - * @since NUTCH-608 - * - *

- * This is a facade class to insulate Nutch from its underlying Mime Type - * substrate library, Apache - * Tika. Any mime handling code should be placed in this utility - * class, and hidden from the Nutch classes that rely on it. - *

+ * This is a facade class to insulate Nutch from its underlying Mime Type + * substrate library, Apache Tika. Any + * Mime handling code should be placed in this utility class, and hidden from + * the Nutch classes that rely on it. */ public final class MimeUtil { @@ -64,6 +58,14 @@ public final class MimeUtil { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); + public static void setPoolSize(int poolSize) { + try { + MimeTypesReader.setPoolSize(poolSize); + } catch (TikaException e) { + LOG.error("Failed to set pool size", e); + } + } + public MimeUtil(Configuration conf) { ObjectCache objectCache = ObjectCache.get(conf); tika = (Tika) objectCache.getObject(Tika.class.getName());