From 29cf4f20e81f1b1e771bc701f554a6f313dd0b4d Mon Sep 17 00:00:00 2001 From: Oliver Kopp Date: Sat, 24 Aug 2019 17:57:53 +0200 Subject: [PATCH] Switch from tika-parsers to tika-core (#5217) --- build.gradle | 2 +- ...fully-support-utf8-only-for-latex-files.md | 44 +++++++++++++++++++ docs/adr/index.md | 1 + .../logic/texparser/DefaultTexParser.java | 7 ++- .../logic/texparser/DefaultTexParserTest.java | 21 ++++++--- 5 files changed, 66 insertions(+), 9 deletions(-) create mode 100644 docs/adr/0005-fully-support-utf8-only-for-latex-files.md diff --git a/build.gradle b/build.gradle index 6ce97a88ba8..244feb633dc 100644 --- a/build.gradle +++ b/build.gradle @@ -94,7 +94,7 @@ dependencies { compile 'org.apache.pdfbox:fontbox:2.0.16' compile 'org.apache.pdfbox:xmpbox:2.0.16' - compile group: 'org.apache.tika', name: 'tika-parsers', version: '1.22' + compile group: 'org.apache.tika', name: 'tika-core', version: '1.22' // required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635 compile 'org.bouncycastle:bcprov-jdk15on:1.62' diff --git a/docs/adr/0005-fully-support-utf8-only-for-latex-files.md b/docs/adr/0005-fully-support-utf8-only-for-latex-files.md new file mode 100644 index 00000000000..6e066e8b7d8 --- /dev/null +++ b/docs/adr/0005-fully-support-utf8-only-for-latex-files.md @@ -0,0 +1,44 @@ +# Fully Support UTF-8 Only For LaTeX Files + +## Context and Problem Statement + +The feature [search for citations](https://github.com/JabRef/help.jabref.org/issues/210) displays the content of LaTeX files. +The LaTeX files are text files and might be encoded arbitrarily. + +## Considered Options + +* Support UTF-8 encoding only +* Support ASCII encoding only +* Support (nearly) all encodings + +## Decision Outcome + +Chosen option: "Support UTF-8 encoding only", because it comes out best (see below). 
+ +### Positive Consequences + +* All content of LaTeX files is displayed in JabRef + +### Negative Consequences + +* When a LaTeX file is encoded in another encoding, the user might see strange characters in JabRef + +## Pros and Cons of the Options + +### Support UTF-8 encoding only + +* Good, because covers most tex file encodings +* Good, because easy to implement +* Bad, because does not support encodings used before around 2010 + +### Support ASCII encoding only + +* Good, because easy to implement +* Bad, because does not support any encoding at all + +### Support (nearly) all encodings + +* Good, because easy to implement +* Bad, because it relies on Apache Tika's `CharsetDetector`, which resides in `tika-parsers`. + This causes issues during compilation (see https://github.com/JabRef/jabref/pull/3421#issuecomment-524532832). + Example: `error: module java.xml.bind reads package javax.activation from both java.activation and jakarta.activation`. diff --git a/docs/adr/index.md b/docs/adr/index.md index 7bbc5b8c4fc..de1818aea3c 100644 --- a/docs/adr/index.md +++ b/docs/adr/index.md @@ -9,6 +9,7 @@ This log lists the architectural decisions for JabRef. 
- [ADR-0002](0002-use-slf4j-for-logging.md) - Use slf4j together with log4j2 for logging - [ADR-0003](0003-use-gradle-as-build-tool.md) - Use Gradle as build tool - [ADR-0004](0004-use-mariadb-connector.md) - Use MariaDB Connector +- [ADR-0005](0005-fully-support-utf8-only-for-latex-files.md) - Fully Support UTF-8 Only For LaTeX Files diff --git a/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java b/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java index 0d6414892ef..de56646261e 100644 --- a/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java +++ b/src/main/java/org/jabref/logic/texparser/DefaultTexParser.java @@ -1,10 +1,13 @@ package org.jabref.logic.texparser; import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.Reader; import java.io.UncheckedIOException; import java.nio.channels.ClosedChannelException; +import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -18,7 +21,6 @@ import org.jabref.model.texparser.TexParser; import org.jabref.model.texparser.TexParserResult; -import org.apache.tika.parser.txt.CharsetDetector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -78,7 +80,8 @@ public TexParserResult parse(List texFiles) { } try ( - Reader reader = new CharsetDetector().setText(Files.readAllBytes(file)).detect().getReader(); + InputStream inputStream = Files.newInputStream(file); + Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8); LineNumberReader lineNumberReader = new LineNumberReader(reader)) { for (String line = lineNumberReader.readLine(); line != null; line = lineNumberReader.readLine()) { // Skip comments and blank lines. 
diff --git a/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java b/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java index a0638550877..1d4047a577f 100644 --- a/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java +++ b/src/test/java/org/jabref/logic/texparser/DefaultTexParserTest.java @@ -12,6 +12,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; public class DefaultTexParserTest { + private final static String DARWIN = "Darwin1888"; private final static String EINSTEIN = "Einstein1920"; private final static String NEWTON = "Newton1999"; @@ -90,7 +91,9 @@ public void testFileEncodingIso88591() throws URISyntaxException { TexParserResult expectedParserResult = new TexParserResult(); expectedParserResult.getFileList().add(texFile); - expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); + // The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005 + expectedParserResult + .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); assertEquals(expectedParserResult, parserResult); } @@ -103,7 +106,9 @@ public void testFileEncodingIso885915() throws URISyntaxException { TexParserResult expectedParserResult = new TexParserResult(); expectedParserResult.getFileList().add(texFile); - expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); + // The character � is on purpose - we cannot use Apache Tika's CharsetDetector - see ADR-0005 + expectedParserResult + .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); assertEquals(expectedParserResult, parserResult); } @@ -114,13 +119,17 @@ public void testFileEncodingForThreeFiles() throws URISyntaxException { Path texFile2 = Paths.get(DefaultTexParserTest.class.getResource("iso-8859-1.tex").toURI()); Path texFile3 = 
Paths.get(DefaultTexParserTest.class.getResource("iso-8859-15.tex").toURI()); - TexParserResult parserResult = new DefaultTexParser().parse(Arrays.asList(texFile, texFile2, texFile3)); + TexParserResult parserResult = new DefaultTexParser() + .parse(Arrays.asList(texFile, texFile2, texFile3)); TexParserResult expectedParserResult = new TexParserResult(); expectedParserResult.getFileList().addAll(Arrays.asList(texFile, texFile2, texFile3)); - expectedParserResult.addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); - expectedParserResult.addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); - expectedParserResult.addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); + expectedParserResult + .addKey("anykey", texFile, 1, 32, 45, "Danach wir anschließend mittels \\cite{anykey}."); + expectedParserResult + .addKey("anykey", texFile2, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); + expectedParserResult + .addKey("anykey", texFile3, 1, 32, 45, "Danach wir anschlie�end mittels \\cite{anykey}."); assertEquals(expectedParserResult, parserResult); }