From 95b37e075c11ff434d00f4512239fad199f28bac Mon Sep 17 00:00:00 2001 From: David Pilato Date: Fri, 1 Feb 2019 09:42:16 +0100 Subject: [PATCH] Warn in case of Tika error Closes #674. --- .../elasticsearch/crawler/fs/tika/TikaDocParser.java | 10 ++++++++++ .../crawler/fs/tika/TikaDocParserTest.java | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParser.java b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParser.java index 34e3fbecf..46590bbd8 100644 --- a/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParser.java +++ b/tika/src/main/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParser.java @@ -93,6 +93,16 @@ public static void generate(FsSettings fsSettings, InputStream inputStream, Stri parsedContent = extractText(fsSettings, indexedChars, inputStream, metadata); logger.trace("End of Tika extraction"); } catch (Throwable e) { + // Build a message from embedded errors + Throwable current = e; + StringBuilder sb = new StringBuilder(); + while (current != null) { + sb.append(" -> "); + sb.append(current.getMessage()); + current = current.getCause(); + } + + logger.warn("Failed to extract [" + indexedChars + "] characters of text for [" + filename + "] {}", sb.toString()); logger.debug("Failed to extract [" + indexedChars + "] characters of text for [" + filename + "]", e); } diff --git a/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java b/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java index 892f728de..086570ffc 100644 --- a/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java +++ b/tika/src/test/java/fr/pilato/elasticsearch/crawler/fs/tika/TikaDocParserTest.java @@ -697,8 +697,17 @@ public void testShiftJisEncoding() throws IOException { assertThat(doc.getContent(), not(isEmptyOrNullString())); } + /** + * Test protected document + */ + @Test + public void testProtectedDocument() throws IOException { + FsSettings fsSettings = FsSettings.builder(getCurrentTestName()).build(); + Doc doc = extractFromFile("test-protected.docx", fsSettings); + assertThat(doc.getFile().getContentType(), is("application/x-tika-ooxml-protected")); + } + private Doc extractFromFileExtension(String extension) throws IOException { - logger.info("Test extraction of [{}] file", extension); return extractFromFile("test." + extension); } @@ -707,6 +716,7 @@ private Doc extractFromFile(String filename) throws IOException { } private Doc extractFromFile(String filename, FsSettings fsSettings) throws IOException { + logger.info("Test extraction of [{}]", filename); InputStream data = getBinaryContent(filename); Doc doc = new Doc(); MessageDigest messageDigest = null;