Skip to content

Commit

Permalink
Merge pull request #1026 from jplag/feature/file-encoding
Browse files Browse the repository at this point in the history
Feature/file encoding
  • Loading branch information
tsaglam authored Apr 26, 2023
2 parents 75193f6 + 67f2cc7 commit ba322d9
Show file tree
Hide file tree
Showing 22 changed files with 258 additions and 30 deletions.
4 changes: 2 additions & 2 deletions core/src/main/java/de/jplag/options/JPlagOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
Expand All @@ -20,6 +19,7 @@
import de.jplag.Language;
import de.jplag.clustering.ClusteringOptions;
import de.jplag.exceptions.BasecodeException;
import de.jplag.util.FileUtils;

/**
* This record defines the options to configure {@link JPlag}.
Expand Down Expand Up @@ -184,7 +184,7 @@ public Integer minimumTokenMatch() {
}

private Set<String> readExclusionFile(final String exclusionFileName) {
try (BufferedReader reader = new BufferedReader(new FileReader(exclusionFileName, JPlagOptions.CHARSET))) {
try (BufferedReader reader = FileUtils.openFileReader(new File(exclusionFileName))) {
final var excludedFileNames = reader.lines().collect(Collectors.toSet());
if (logger.isDebugEnabled()) {
logger.debug("Excluded files:{}{}", System.lineSeparator(), String.join(System.lineSeparator(), excludedFileNames));
Expand Down
5 changes: 5 additions & 0 deletions language-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,10 @@
<groupId>org.kohsuke.metainf-services</groupId>
<artifactId>metainf-services</artifactId>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j-charset</artifactId>
<version>68.1</version>
</dependency>
</dependencies>
</project>
154 changes: 154 additions & 0 deletions language-api/src/main/java/de/jplag/util/FileUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
package de.jplag.util;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import de.jplag.ParsingException;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
 * Encapsulates various interactions with files to prevent issues with file encodings.
 * <p>
 * Readers are opened with a charset guessed from the file content (via ICU's {@link CharsetDetector}); writers always
 * use UTF-8.
 */
public class FileUtils {
    private static final Charset DEFAULT_OUTPUT_CHARSET = StandardCharsets.UTF_8;
    /** Charset assumed when detection yields no candidate at all. */
    private static final Charset FALLBACK_INPUT_CHARSET = StandardCharsets.UTF_8;
    private static final char BYTE_ORDER_MARK = '\uFEFF';
    private static final int SINGLE_CHAR_BUFFER_SIZE = 10;

    private FileUtils() {
    }

    /**
     * Opens a file reader, guessing the charset from the content. Also, if the file is encoded in a UTF* encoding and a
     * bom exists, it is removed from the reader.
     * <p>
     * The file is opened twice: once (and closed again) to detect the charset, and once for the returned reader. The
     * caller owns the returned reader and must close it.
     * @param file The file to open for read
     * @return The reader, configured with the best matching charset
     * @throws IOException If the file does not exist or is not readable
     */
    public static BufferedReader openFileReader(File file) throws IOException {
        // Delegate to the File overload so the stream used for detection is closed again.
        Charset charset = detectCharset(file);
        BufferedReader reader = new BufferedReader(new FileReader(file, charset));
        try {
            removeBom(reader, charset);
        } catch (IOException | RuntimeException e) {
            // The reader is not handed out on failure, so it has to be released here.
            try {
                reader.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        return reader;
    }

    /**
     * Reads the contents of a file into a single string. Lines are re-joined with {@link System#lineSeparator()}, so
     * the original line terminators are normalized and a trailing line terminator is not preserved.
     * @param file The file to read
     * @return The files content as a string
     * @throws IOException If an IO error occurs
     * @see FileUtils#openFileReader(File)
     */
    public static String readFileContent(File file) throws IOException {
        try (BufferedReader reader = openFileReader(file)) {
            return reader.lines().collect(Collectors.joining(System.lineSeparator()));
        }
    }

    /**
     * Removes the byte order mark from the beginning of the stream, if it exists and the charset is a UTF* charset. For
     * details see: <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia</a>
     * @param reader The reader to remove the bom from
     * @param charset The charset the reader decodes with; only charsets whose name starts with "UTF" are considered
     * @throws IOException If an IO error occurs.
     */
    private static void removeBom(BufferedReader reader, Charset charset) throws IOException {
        // Locale.ROOT keeps the comparison independent of the JVM's default locale.
        if (!charset.name().toUpperCase(Locale.ROOT).startsWith("UTF")) {
            return;
        }
        reader.mark(SINGLE_CHAR_BUFFER_SIZE);
        if (reader.read() != BYTE_ORDER_MARK) {
            // First character is real content (or end of file): rewind so it is not lost.
            reader.reset();
        }
    }

    /**
     * Detects the charset of a file. Prefer using {@link #openFileReader(File)} or {@link #readFileContent(File)} if you
     * are only interested in the content.
     * @param file The file to detect
     * @return The most probable charset
     * @throws IOException If an IO error occurs
     */
    public static Charset detectCharset(File file) throws IOException {
        try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) {
            return detectCharset(stream);
        }
    }

    /**
     * Detects the most probable charset over the whole set of files.
     * <p>
     * For every detectable charset the confidence values reported per file are averaged. A charset that was not
     * reported for some file (or was reported with confidence 0) is ruled out entirely. If no charset remains, for
     * example because {@code files} is empty, UTF-8 is returned.
     * @param files The files to check
     * @return The most probable charset
     * @throws ParsingException If one of the files cannot be read
     */
    public static Charset detectCharsetFromMultiple(Collection<File> files) throws ParsingException {
        Set<String> detectableCharsets = Set.of(CharsetDetector.getAllDetectableCharsets());
        Map<String, List<Integer>> confidencesByCharset = new HashMap<>();

        for (File file : files) {
            CharsetMatch[] matches;
            try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) {
                matches = detectAllCharsets(stream);
            } catch (IOException e) {
                throw new ParsingException(file, e);
            }

            Set<String> unmatched = new HashSet<>(detectableCharsets);
            for (CharsetMatch match : matches) {
                confidencesByCharset.computeIfAbsent(match.getName(), key -> new ArrayList<>()).add(match.getConfidence());
                unmatched.remove(match.getName());
            }
            // Charsets the detector did not report for this file count as confidence 0.
            for (String charsetName : unmatched) {
                confidencesByCharset.computeIfAbsent(charsetName, key -> new ArrayList<>()).add(0);
            }
        }

        Charset mostProbable = StandardCharsets.UTF_8;
        double mostProbableConfidence = 0.0;
        for (Map.Entry<String, List<Integer>> entry : confidencesByCharset.entrySet()) {
            List<Integer> confidenceValues = entry.getValue();
            if (confidenceValues.contains(0)) {
                // Ruled out for at least one file, so it cannot be the common encoding.
                continue;
            }
            double average = confidenceValues.stream().mapToInt(Integer::intValue).average().orElse(0);
            if (average > mostProbableConfidence) {
                mostProbable = Charset.forName(entry.getKey());
                mostProbableConfidence = average;
            }
        }

        return mostProbable;
    }

    /**
     * Detects the most probable charset of the given stream content.
     * @param stream The stream to inspect; NOTE(review): ICU's detector presumably needs mark/reset support, which the
     * {@link BufferedInputStream}s passed by the callers in this class provide
     * @return The best match, or UTF-8 if the detector reports no match
     * @throws IOException If an IO error occurs
     */
    private static Charset detectCharset(InputStream stream) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(stream);

        CharsetMatch match = charsetDetector.detect();
        if (match == null) {
            // detect() is documented to return null when no charset matches.
            return FALLBACK_INPUT_CHARSET;
        }
        return Charset.forName(match.getName());
    }

    /**
     * Collects all charset candidates the detector reports for the given stream content.
     * @param stream The stream to inspect
     * @return The matches as returned by {@link CharsetDetector#detectAll()}
     * @throws IOException If an IO error occurs
     */
    private static CharsetMatch[] detectAllCharsets(InputStream stream) throws IOException {
        CharsetDetector charsetDetector = new CharsetDetector();
        charsetDetector.setText(stream);
        return charsetDetector.detectAll();
    }

    /**
     * Opens a file writer, using the default charset for JPlag
     * @param file The file to write
     * @return The file writer, configured with the default charset
     * @throws IOException If the file cannot be created or opened for writing
     */
    public static Writer openFileWriter(File file) throws IOException {
        return new BufferedWriter(new FileWriter(file, DEFAULT_OUTPUT_CHARSET));
    }
}
48 changes: 48 additions & 0 deletions language-api/src/test/java/de/jplag/util/FileUtilTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package de.jplag.util;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Set;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

import de.jplag.ParsingException;

/**
 * Tests for {@link FileUtils}: reading files in different encodings, detecting the charset of a single file, and
 * detecting a common charset for a set of files.
 */
public class FileUtilTest {
    private static final String EXPECTED_CONTENT = "Some ascii characters and some others: ä#+öü%&(/)?=?";

    private static final Path RESOURCE_ROOT = Path.of("src", "test", "resources", "de", "jplag");
    /** Directory whose files all hold the expected content; each file is named after the charset it is encoded in. */
    private static final Path SINGLE_FILE_DIRECTORY = RESOURCE_ROOT.resolve("fileReaderTests");
    /** Directory with a set of files for which one common charset is to be detected. */
    private static final Path FILE_SET_DIRECTORY = RESOURCE_ROOT.resolve("fileSetEncoding");

    /**
     * Supplies the files for the parameterized tests.
     * @return All files in the single-file test directory
     */
    public static File[] searchTestFiles() {
        File directory = SINGLE_FILE_DIRECTORY.toFile();
        return directory.listFiles();
    }

    /**
     * Every test file must decode to the same expected text, regardless of its encoding.
     */
    @ParameterizedTest
    @MethodSource("searchTestFiles")
    void testReadFile(File file) throws IOException {
        String actualContent = FileUtils.readFileContent(file);
        String failureMessage = "File contains unexpected content: " + file.getAbsolutePath();

        Assertions.assertEquals(EXPECTED_CONTENT, actualContent, failureMessage);
    }

    /**
     * The detected charset must be the one the test file is named after.
     */
    @ParameterizedTest
    @MethodSource("searchTestFiles")
    void testCharsetDetection(File file) throws IOException {
        Charset expectedCharset = Charset.forName(file.getName());
        Charset detectedCharset = FileUtils.detectCharset(file);
        String failureMessage = "Wrong charset assumed for: " + file.getAbsolutePath();

        Assertions.assertEquals(expectedCharset, detectedCharset, failureMessage);
    }

    /**
     * For the file set, ISO-8859-1 is expected as the common charset.
     */
    @Test
    void testDetectFromFileSet() throws ParsingException {
        File[] fileSetContent = FILE_SET_DIRECTORY.toFile().listFiles();
        Charset detectedCharset = FileUtils.detectCharsetFromMultiple(Set.of(fileSetContent));

        Assertions.assertEquals(StandardCharsets.ISO_8859_1, detectedCharset);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some ascii characters and some others: �#+��%&(/)?=?
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some ascii characters and some others: ä#+öü%&(/)?=?
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
some simple ascii characters
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
some more ascii characters
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
this contains a non ascii character: �
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
Expand All @@ -17,6 +16,7 @@
import de.jplag.TokenType;
import de.jplag.cpp2.grammar.CPP14Lexer;
import de.jplag.cpp2.grammar.CPP14Parser;
import de.jplag.util.FileUtils;

/**
* The adapter between {@link AbstractParser} and the ANTLR based parser of this language module.
Expand All @@ -37,7 +37,7 @@ public List<Token> scan(Set<File> files) throws ParsingException {
this.currentFile = file;
logger.trace("Parsing file {}", currentFile);
try {
CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromStream(Files.newInputStream(file.toPath())));
CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromReader(FileUtils.openFileReader(file)));
// create a buffer of tokens pulled from the lexer
CommonTokenStream tokenStream = new CommonTokenStream(lexer);
CPP14Parser parser = new CPP14Parser(tokenStream);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package de.jplag.csharp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -19,6 +19,7 @@
import de.jplag.TokenType;
import de.jplag.csharp.grammar.CSharpLexer;
import de.jplag.csharp.grammar.CSharpParser;
import de.jplag.util.FileUtils;

/**
* Parser adapter for the ANTLR 4 CSharp Parser and Lexer. It receives file to parse and passes them to the ANTLR
Expand Down Expand Up @@ -51,11 +52,11 @@ public List<Token> parse(Set<File> files) throws ParsingException {
}

private void parseFile(File file) throws ParsingException {
try (FileInputStream inputStream = new FileInputStream(file)) {
try (BufferedReader reader = FileUtils.openFileReader(file)) {
currentFile = file;

// create a lexer, a parser and a buffer between them.
CSharpLexer lexer = new CSharpLexer(CharStreams.fromStream(inputStream));
CSharpLexer lexer = new CSharpLexer(CharStreams.fromReader(reader));
CommonTokenStream tokens = new CommonTokenStream(lexer);
CSharpParser parser = new CSharpParser(tokens);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package de.jplag.golang;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -19,6 +19,7 @@
import de.jplag.TokenType;
import de.jplag.golang.grammar.GoLexer;
import de.jplag.golang.grammar.GoParser;
import de.jplag.util.FileUtils;

public class GoParserAdapter extends AbstractParser {
private File currentFile;
Expand All @@ -34,10 +35,10 @@ public List<Token> parse(Set<File> files) throws ParsingException {
}

private void parseFile(File file) throws ParsingException {
try (FileInputStream inputStream = new FileInputStream(file)) {
try (BufferedReader reader = FileUtils.openFileReader(file)) {
currentFile = file;

GoLexer lexer = new GoLexer(CharStreams.fromStream(inputStream));
GoLexer lexer = new GoLexer(CharStreams.fromReader(reader));
CommonTokenStream tokenStream = new CommonTokenStream(lexer);
GoParser parser = new GoParser(tokenStream);

Expand Down
6 changes: 4 additions & 2 deletions languages/java/src/main/java/de/jplag/java/JavacAdapter.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
Expand All @@ -20,6 +20,7 @@

import de.jplag.ParsingException;
import de.jplag.Token;
import de.jplag.util.FileUtils;

import com.sun.source.tree.CompilationUnitTree;
import com.sun.source.tree.LineMap;
Expand All @@ -35,7 +36,8 @@ public void parseFiles(Set<File> files, final Parser parser) throws ParsingExcep
var listener = new DiagnosticCollector<>();

List<ParsingException> parsingExceptions = new ArrayList<>();
try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, StandardCharsets.UTF_8)) {
final Charset guessedCharset = FileUtils.detectCharsetFromMultiple(files);
try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, guessedCharset)) {
var javaFiles = fileManager.getJavaFileObjectsFromFiles(files);

// We need to disable annotation processing, see
Expand Down
7 changes: 4 additions & 3 deletions languages/python-3/src/main/java/de/jplag/python3/Parser.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package de.jplag.python3;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -19,6 +19,7 @@
import de.jplag.python3.grammar.Python3Lexer;
import de.jplag.python3.grammar.Python3Parser;
import de.jplag.python3.grammar.Python3Parser.File_inputContext;
import de.jplag.util.FileUtils;

public class Parser extends AbstractParser {

Expand All @@ -43,11 +44,11 @@ public List<Token> parse(Set<File> files) throws ParsingException {
}

private void parseFile(File file) throws ParsingException {
try (FileInputStream fileInputStream = new FileInputStream((file))) {
try (BufferedReader reader = FileUtils.openFileReader(file)) {
currentFile = file;

// create a lexer that feeds off of input CharStream
Python3Lexer lexer = new Python3Lexer(CharStreams.fromStream(fileInputStream));
Python3Lexer lexer = new Python3Lexer(CharStreams.fromReader(reader));

// create a buffer of tokens pulled from the lexer
CommonTokenStream tokens = new CommonTokenStream(lexer);
Expand Down
Loading

0 comments on commit ba322d9

Please sign in to comment.