Merge pull request #1026 from jplag/feature/file-encoding

Feature/file encoding
jplag · Apr 26, 2023 · ba322d9 · ba322d9
2 parents 75193f6 + 67f2cc7
commit ba322d9
Show file tree

Hide file tree

Showing 22 changed files with 258 additions and 30 deletions.
diff --git a/core/src/main/java/de/jplag/options/JPlagOptions.java b/core/src/main/java/de/jplag/options/JPlagOptions.java
@@ -2,7 +2,6 @@
 
 import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileReader;
 import java.io.IOException;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
@@ -20,6 +19,7 @@
 import de.jplag.Language;
 import de.jplag.clustering.ClusteringOptions;
 import de.jplag.exceptions.BasecodeException;
+import de.jplag.util.FileUtils;
 
 /**
  * This record defines the options to configure {@link JPlag}.
@@ -184,7 +184,7 @@ public Integer minimumTokenMatch() {
     }
 
     private Set<String> readExclusionFile(final String exclusionFileName) {
-        try (BufferedReader reader = new BufferedReader(new FileReader(exclusionFileName, JPlagOptions.CHARSET))) {
+        try (BufferedReader reader = FileUtils.openFileReader(new File(exclusionFileName))) {
             final var excludedFileNames = reader.lines().collect(Collectors.toSet());
             if (logger.isDebugEnabled()) {
                 logger.debug("Excluded files:{}{}", System.lineSeparator(), String.join(System.lineSeparator(), excludedFileNames));

diff --git a/language-api/pom.xml b/language-api/pom.xml
@@ -15,5 +15,10 @@
             <groupId>org.kohsuke.metainf-services</groupId>
             <artifactId>metainf-services</artifactId>
         </dependency>
+        <dependency>
+            <groupId>com.ibm.icu</groupId>
+            <artifactId>icu4j-charset</artifactId>
+            <version>68.1</version>
+        </dependency>
     </dependencies>
 </project>
diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java
@@ -0,0 +1,154 @@
+package de.jplag.util;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicReference;
+import java.util.stream.Collectors;
+
+import de.jplag.ParsingException;
+
+import com.ibm.icu.text.CharsetDetector;
+import com.ibm.icu.text.CharsetMatch;
+
+/**
+ * Encapsulates various interactions with files to prevent issues with file encodings.
+ */
+public class FileUtils {
+    private static final Charset DEFAULT_OUTPUT_CHARSET = StandardCharsets.UTF_8;
+    private static final char BYTE_ORDER_MARK = '\uFEFF';
+    private static final int SINGLE_CHAR_BUFFER_SIZE = 10;
+
+    private FileUtils() {
+    }
+
+    /**
+     * Opens a file reader, guessing the charset from the content. Also, if the file is encoded in a UTF* encoding and a bom
+     * exists, it is removed from the reader.
+     * @param file The file to open for read
+     * @return The reader, configured with the best matching charset
+     * @throws IOException If the file does not exist for is not readable
+     */
+    public static BufferedReader openFileReader(File file) throws IOException {
+        InputStream stream = new BufferedInputStream(new FileInputStream(file));
+        Charset charset = detectCharset(stream);
+        BufferedReader reader = new BufferedReader(new FileReader(file, charset));
+        removeBom(reader, charset);
+        return reader;
+    }
+
+    /**
+     * Reads the contents of a file into a single string.
+     * @param file The file to read
+     * @return The files content as a string
+     * @throws IOException If an IO error occurs
+     * @see FileUtils#openFileReader(File)
+     */
+    public static String readFileContent(File file) throws IOException {
+        try (BufferedReader reader = openFileReader(file)) {
+            return reader.lines().collect(Collectors.joining(System.lineSeparator()));
+        }
+    }
+
+    /**
+     * Removes the byte order mark from the beginning of the stream, if it exists and the charset is a UTF* charset. For
+     * details see: <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia</a>
+     * @param reader The reader to remove the bom from
+     * @throws IOException If an IO error occurs.
+     */
+    private static void removeBom(BufferedReader reader, Charset charset) throws IOException {
+        if (charset.name().toUpperCase().startsWith("UTF")) {
+            reader.mark(SINGLE_CHAR_BUFFER_SIZE);
+            if (reader.read() != BYTE_ORDER_MARK) {
+                reader.reset();
+            }
+        }
+    }
+
+    /**
+     * Detects the charset of a file. Prefer using {@link #openFileReader(File)} or {@link #readFileContent(File)} if you
+     * are only interested in the content.
+     * @param file The file to detect
+     * @return The most probable charset
+     * @throws IOException If an IO error occurs
+     */
+    public static Charset detectCharset(File file) throws IOException {
+        try (InputStream stream = new BufferedInputStream(new FileInputStream((file)))) {
+            return detectCharset(stream);
+        }
+    }
+
+    /**
+     * Detects the most probable charset over the whole set of files.
+     * @param files The files to check
+     * @return The most probable charset
+     */
+    public static Charset detectCharsetFromMultiple(Collection<File> files) throws ParsingException {
+        Map<String, List<Integer>> charsetValues = new HashMap<>();
+
+        List<CharsetMatch[]> matchData = new ArrayList<>();
+        for (File file : files) {
+            try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) {
+                matchData.add(detectAllCharsets(stream));
+            } catch (IOException e) {
+                throw new ParsingException(file, e);
+            }
+        }
+
+        for (CharsetMatch[] matches : matchData) {
+            Set<String> remaining = new HashSet<>(Set.of(CharsetDetector.getAllDetectableCharsets()));
+            for (CharsetMatch match : matches) {
+                charsetValues.putIfAbsent(match.getName(), new ArrayList<>());
+                charsetValues.get(match.getName()).add(match.getConfidence());
+                remaining.remove(match.getName());
+            }
+            remaining.forEach(it -> {
+                charsetValues.putIfAbsent(it, new ArrayList<>());
+                charsetValues.get(it).add(0);
+            });
+        }
+
+        AtomicReference<Charset> mostProbable = new AtomicReference<>(StandardCharsets.UTF_8);
+        AtomicReference<Double> mostProbableConfidence = new AtomicReference<>(0.0);
+        charsetValues.forEach((charset, confidenceValues) -> {
+            double average = confidenceValues.stream().mapToInt(it -> it).average().orElse(0);
+            if (confidenceValues.stream().anyMatch(it -> it == 0)) {
+                average = 0;
+            }
+            if (average > mostProbableConfidence.get()) {
+                mostProbable.set(Charset.forName(charset));
+                mostProbableConfidence.set(average);
+            }
+        });
+
+        return mostProbable.get();
+    }
+
+    private static Charset detectCharset(InputStream stream) throws IOException {
+        CharsetDetector charsetDetector = new CharsetDetector();
+
+        charsetDetector.setText(stream);
+
+        CharsetMatch match = charsetDetector.detect();
+        return Charset.forName(match.getName());
+    }
+
+    private static CharsetMatch[] detectAllCharsets(InputStream stream) throws IOException {
+        CharsetDetector charsetDetector = new CharsetDetector();
+
+        charsetDetector.setText(stream);
+
+        return charsetDetector.detectAll();
+    }
+
+    /**
+     * Opens a file writer, using the default charset for JPlag
+     * @param file The file to write
+     * @return The file writer, configured with the default charset
+     * @throws IOException If the file does not exist or is not writable
+     */
+    public static Writer openFileWriter(File file) throws IOException {
+        return new BufferedWriter(new FileWriter(file, DEFAULT_OUTPUT_CHARSET));
+    }
+}
diff --git a/language-api/src/test/java/de/jplag/util/FileUtilTest.java b/language-api/src/test/java/de/jplag/util/FileUtilTest.java
@@ -0,0 +1,48 @@
+package de.jplag.util;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.Set;
+
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import de.jplag.ParsingException;
+
+public class FileUtilTest {
+    private static final Path TEST_FILE_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileReaderTests");
+    private static final Path TEST_FILE_SET_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileSetEncoding");
+
+    private static final String expectedFileContent = "Some ascii characters and some others: ä#+öü%&(/)?=?";
+
+    @ParameterizedTest
+    @MethodSource("searchTestFiles")
+    void testReadFile(File file) throws IOException {
+        String found = FileUtils.readFileContent(file);
+
+        Assertions.assertEquals(expectedFileContent, found, "File contains unexpected content: " + file.getAbsolutePath());
+    }
+
+    @ParameterizedTest
+    @MethodSource("searchTestFiles")
+    void testCharsetDetection(File file) throws IOException {
+        Assertions.assertEquals(Charset.forName(file.getName()), FileUtils.detectCharset(file),
+                "Wrong charset assumed for: " + file.getAbsolutePath());
+    }
+
+    @Test
+    void testDetectFromFileSet() throws ParsingException {
+        Set<File> files = Set.of(TEST_FILE_SET_LOCATION.toFile().listFiles());
+        Charset encoding = FileUtils.detectCharsetFromMultiple(files);
+        Assertions.assertEquals(StandardCharsets.ISO_8859_1, encoding);
+    }
+
+    public static File[] searchTestFiles() {
+        return TEST_FILE_LOCATION.toFile().listFiles();
+    }
+}
diff --git a/language-api/src/test/resources/de/jplag/fileReaderTests/ISO-8859-1 b/language-api/src/test/resources/de/jplag/fileReaderTests/ISO-8859-1
@@ -0,0 +1 @@
+Some ascii characters and some others: �#+��%&(/)?=?
diff --git a/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-16LE b/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-16LE
diff --git a/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-32BE b/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-32BE
diff --git a/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-8 b/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-8
@@ -0,0 +1 @@
+Some ascii characters and some others: ä#+öü%&(/)?=?
diff --git a/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii1 b/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii1
@@ -0,0 +1 @@
+some simple ascii characters
diff --git a/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii2 b/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii2
@@ -0,0 +1 @@
+some more ascii characters
diff --git a/language-api/src/test/resources/de/jplag/fileSetEncoding/notAscii b/language-api/src/test/resources/de/jplag/fileSetEncoding/notAscii
@@ -0,0 +1 @@
+this contains a non ascii character: �
diff --git a/languages/cpp2/src/main/java/de/jplag/cpp2/CPPParserAdapter.java b/languages/cpp2/src/main/java/de/jplag/cpp2/CPPParserAdapter.java
@@ -2,7 +2,6 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.file.Files;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Set;
@@ -17,6 +16,7 @@
 import de.jplag.TokenType;
 import de.jplag.cpp2.grammar.CPP14Lexer;
 import de.jplag.cpp2.grammar.CPP14Parser;
+import de.jplag.util.FileUtils;
 
 /**
  * The adapter between {@link AbstractParser} and the ANTLR based parser of this language module.
@@ -37,7 +37,7 @@ public List<Token> scan(Set<File> files) throws ParsingException {
             this.currentFile = file;
             logger.trace("Parsing file {}", currentFile);
             try {
-                CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromStream(Files.newInputStream(file.toPath())));
+                CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromReader(FileUtils.openFileReader(file)));
                 // create a buffer of tokens pulled from the lexer
                 CommonTokenStream tokenStream = new CommonTokenStream(lexer);
                 CPP14Parser parser = new CPP14Parser(tokenStream);

diff --git a/languages/csharp/src/main/java/de/jplag/csharp/CSharpParserAdapter.java b/languages/csharp/src/main/java/de/jplag/csharp/CSharpParserAdapter.java
@@ -1,7 +1,7 @@
 package de.jplag.csharp;
 
+import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -19,6 +19,7 @@
 import de.jplag.TokenType;
 import de.jplag.csharp.grammar.CSharpLexer;
 import de.jplag.csharp.grammar.CSharpParser;
+import de.jplag.util.FileUtils;
 
 /**
  * Parser adapter for the ANTLR 4 CSharp Parser and Lexer. It receives file to parse and passes them to the ANTLR
@@ -51,11 +52,11 @@ public List<Token> parse(Set<File> files) throws ParsingException {
     }
 
     private void parseFile(File file) throws ParsingException {
-        try (FileInputStream inputStream = new FileInputStream(file)) {
+        try (BufferedReader reader = FileUtils.openFileReader(file)) {
             currentFile = file;
 
             // create a lexer, a parser and a buffer between them.
-            CSharpLexer lexer = new CSharpLexer(CharStreams.fromStream(inputStream));
+            CSharpLexer lexer = new CSharpLexer(CharStreams.fromReader(reader));
             CommonTokenStream tokens = new CommonTokenStream(lexer);
             CSharpParser parser = new CSharpParser(tokens);
 

diff --git a/languages/golang/src/main/java/de/jplag/golang/GoParserAdapter.java b/languages/golang/src/main/java/de/jplag/golang/GoParserAdapter.java
@@ -1,7 +1,7 @@
 package de.jplag.golang;
 
+import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -19,6 +19,7 @@
 import de.jplag.TokenType;
 import de.jplag.golang.grammar.GoLexer;
 import de.jplag.golang.grammar.GoParser;
+import de.jplag.util.FileUtils;
 
 public class GoParserAdapter extends AbstractParser {
     private File currentFile;
@@ -34,10 +35,10 @@ public List<Token> parse(Set<File> files) throws ParsingException {
     }
 
     private void parseFile(File file) throws ParsingException {
-        try (FileInputStream inputStream = new FileInputStream(file)) {
+        try (BufferedReader reader = FileUtils.openFileReader(file)) {
             currentFile = file;
 
-            GoLexer lexer = new GoLexer(CharStreams.fromStream(inputStream));
+            GoLexer lexer = new GoLexer(CharStreams.fromReader(reader));
             CommonTokenStream tokenStream = new CommonTokenStream(lexer);
             GoParser parser = new GoParser(tokenStream);
 

diff --git a/languages/java/src/main/java/de/jplag/java/JavacAdapter.java b/languages/java/src/main/java/de/jplag/java/JavacAdapter.java
@@ -2,7 +2,7 @@
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.charset.StandardCharsets;
+import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
@@ -20,6 +20,7 @@
 
 import de.jplag.ParsingException;
 import de.jplag.Token;
+import de.jplag.util.FileUtils;
 
 import com.sun.source.tree.CompilationUnitTree;
 import com.sun.source.tree.LineMap;
@@ -35,7 +36,8 @@ public void parseFiles(Set<File> files, final Parser parser) throws ParsingExcep
         var listener = new DiagnosticCollector<>();
 
         List<ParsingException> parsingExceptions = new ArrayList<>();
-        try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, StandardCharsets.UTF_8)) {
+        final Charset guessedCharset = FileUtils.detectCharsetFromMultiple(files);
+        try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, guessedCharset)) {
             var javaFiles = fileManager.getJavaFileObjectsFromFiles(files);
 
             // We need to disable annotation processing, see

diff --git a/languages/python-3/src/main/java/de/jplag/python3/Parser.java b/languages/python-3/src/main/java/de/jplag/python3/Parser.java
@@ -1,7 +1,7 @@
 package de.jplag.python3;
 
+import java.io.BufferedReader;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
@@ -19,6 +19,7 @@
 import de.jplag.python3.grammar.Python3Lexer;
 import de.jplag.python3.grammar.Python3Parser;
 import de.jplag.python3.grammar.Python3Parser.File_inputContext;
+import de.jplag.util.FileUtils;
 
 public class Parser extends AbstractParser {
 
@@ -43,11 +44,11 @@ public List<Token> parse(Set<File> files) throws ParsingException {
     }
 
     private void parseFile(File file) throws ParsingException {
-        try (FileInputStream fileInputStream = new FileInputStream((file))) {
+        try (BufferedReader reader = FileUtils.openFileReader(file)) {
             currentFile = file;
 
             // create a lexer that feeds off of input CharStream
-            Python3Lexer lexer = new Python3Lexer(CharStreams.fromStream(fileInputStream));
+            Python3Lexer lexer = new Python3Lexer(CharStreams.fromReader(reader));
 
             // create a buffer of tokens pulled from the lexer
             CommonTokenStream tokens = new CommonTokenStream(lexer);
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Some ascii characters and some others: �#+��%&(/)?=?
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Some ascii characters and some others: ä#+öü%&(/)?=?