From 29cf4f20e81f1b1e771bc701f554a6f313dd0b4d Mon Sep 17 00:00:00 2001
From: Oliver Kopp <kopp.dev@gmail.com>
Date: Sat, 24 Aug 2019 17:57:53 +0200
Subject: [PATCH] Switch from tika-parsers to tika-core (#5217)

---
 build.gradle                                  |  2 +-
 ...fully-support-utf8-only-for-latex-files.md | 44 +++++++++++++++++++
 docs/adr/index.md                             |  1 +
 .../logic/texparser/DefaultTexParser.java     |  7 ++-
 .../logic/texparser/DefaultTexParserTest.java | 21 ++++++---
 5 files changed, 66 insertions(+), 9 deletions(-)
 create mode 100644 docs/adr/0005-fully-support-utf8-only-for-latex-files.md

diff --git a/build.gradle b/build.gradle
index 6ce97a88ba8..244feb633dc 100644
--- a/build.gradle
+++ b/build.gradle
@@ -94,7 +94,7 @@ dependencies {
     compile 'org.apache.pdfbox:fontbox:2.0.16'
     compile 'org.apache.pdfbox:xmpbox:2.0.16'
 
-    compile group: 'org.apache.tika', name: 'tika-parsers', version: '1.22'
+    compile group: 'org.apache.tika', name: 'tika-core', version: '1.22'
 
     // required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635
     compile 'org.bouncycastle:bcprov-jdk15on:1.62'
diff --git a/docs/adr/0005-fully-support-utf8-only-for-latex-files.md b/docs/adr/0005-fully-support-utf8-only-for-latex-files.md
new file mode 100644
index 00000000000..6e066e8b7d8
--- /dev/null
+++ b/docs/adr/0005-fully-support-utf8-only-for-latex-files.md
@@ -0,0 +1,44 @@
+# Fully Support UTF-8 Only For LaTeX Files
+
+## Context and Problem Statement
+
+The feature [search for citations](https://github.com/JabRef/help.jabref.org/issues/210) displays the content of LaTeX files.
+The LaTeX files are text files and might be encoded arbitrarily.
+
+## Considered Options
+
+* Support UTF-8 encoding only
+* Support ASCII encoding only
+* Support (nearly) all encodings
+
+## Decision Outcome
+
+Chosen option: "Support UTF-8 encoding only", because it comes out best (see below).
+
+### Positive Consequences
+
+* All content of LaTeX files is displayed in JabRef
+
+### Negative Consequences
+
+* When a LaTeX file is encoded in another encoding, the user might see strange characters in JabRef
+
+## Pros and Cons of the Options
+
+### Support UTF-8 encoding only
+
+* Good, because it covers most TeX file encodings
+* Good, because it is easy to implement
+* Bad, because it does not support encodings used before around 2010
+
+### Support ASCII encoding only
+
+* Good, because it is easy to implement
+* Bad, because it does not support any encoding beyond ASCII at all
+
+### Support (nearly) all encodings
+
+* Good, because it is easy to implement
+* Bad, because it relies on Apache Tika's `CharsetDetector`, which resides in `tika-parsers`.
+  This causes issues during compilation (see https://github.com/JabRef/jabref/pull/3421#issuecomment-524532832).
+  Example: `error: module java.xml.bind reads package javax.activation from both java.activation and jakarta.activation`.
diff --git a/docs/adr/index.md b/docs/adr/index.md
index 7bbc5b8c4fc..de1818aea3c 100644
--- a/docs/adr/index.md
+++ b/docs/adr/index.md
@@ -9,6 +9,7 @@ This log lists the architectural decisions for JabRef.
 - [ADR-0002](0002-use-slf4j-for-logging.md) - Use slf4j together with log4j2 for logging
 - [ADR-0003](0003-use-gradle-as-build-tool.md) - Use Gradle as build tool
 - [ADR-0004](0004-use-mariadb-connector.md) - Use MariaDB Connector
+- [ADR-0005](0005-fully-support-utf8-only-for-latex-files.md) - Fully Support UTF-8 Only For LaTeX Files
 
 <!-- adrlogstop -->
 
diff --git a/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java b/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java
index 0d6414892ef..de56646261e 100644
--- a/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java
+++ b/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java
@@ -1,10 +1,13 @@
 package org.jabref.logic.texparser;
 
 import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
 import java.io.UncheckedIOException;
 import java.nio.channels.ClosedChannelException;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -18,7 +21,6 @@
 import org.jabref.model.texparser.TexParser;
 import org.jabref.model.texparser.TexParserResult;
 
-import org.apache.tika.parser.txt.CharsetDetector;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -78,7 +80,8 @@ public TexParserResult parse(List<Path> texFiles) {
             }
 
             try (
-                    Reader reader = new CharsetDetector().setText(Files.readAllBytes(file)).detect().getReader();
+                    InputStream inputStream = Files.newInputStream(file);
+                    Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8);
                     LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
                 for (String line = lineNumberReader.readLine(); line != null; line = lineNumberReader.readLine()) {
                     // Skip comments and blank lines.
diff --git a/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java b/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java
index a0638550877..1d4047a577f 100644
--- a/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java
+++ b/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java
@@ -12,6 +12,7 @@
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 public class DefaultTexParserTest {
+
     private final static String DARWIN = "Darwin1888";
     private final static String EINSTEIN = "Einstein1920";
     private final static String NEWTON = "Newton1999";
@@ -90,7 +91,9 @@ public void testFileEncodingIso88591() throws URISyntaxException {
         TexParserResult expectedParserResult = new TexParserResult();
 
         expectedParserResult.getFileList().add(texFile);
-        expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
+        // The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005
+        expectedParserResult
+                .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
 
         assertEquals(expectedParserResult, parserResult);
     }
@@ -103,7 +106,9 @@ public void testFileEncodingIso885915() throws URISyntaxException {
         TexParserResult expectedParserResult = new TexParserResult();
 
         expectedParserResult.getFileList().add(texFile);
-        expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
+        // The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005
+        expectedParserResult
+                .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
 
         assertEquals(expectedParserResult, parserResult);
     }
@@ -114,13 +119,17 @@ public void testFileEncodingForThreeFiles() throws URISyntaxException {
         Path texFile2 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-1.tex").toURI());
         Path texFile3 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-15.tex").toURI());
 
-        TexParserResult parserResult = new DefaultTexParser().parse(Arrays.asList(texFile, texFile2, texFile3));
+        TexParserResult parserResult = new DefaultTexParser()
+                .parse(Arrays.asList(texFile, texFile2, texFile3));
         TexParserResult expectedParserResult = new TexParserResult();
 
         expectedParserResult.getFileList().addAll(Arrays.asList(texFile, texFile2, texFile3));
-        expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
-        expectedParserResult.addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
-        expectedParserResult.addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
+        expectedParserResult
+                .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}.");
+        expectedParserResult
+                .addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
+        expectedParserResult
+                .addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}.");
 
         assertEquals(expectedParserResult, parserResult);
     }