Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/file encoding #1026

Merged
merged 13 commits into from
Apr 26, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions core/src/main/java/de/jplag/options/JPlagOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
Expand All @@ -20,6 +19,7 @@
import de.jplag.Language;
import de.jplag.clustering.ClusteringOptions;
import de.jplag.exceptions.BasecodeException;
import de.jplag.util.FileUtils;

/**
* This record defines the options to configure {@link JPlag}.
Expand Down Expand Up @@ -184,7 +184,7 @@ public Integer minimumTokenMatch() {
}

private Set<String> readExclusionFile(final String exclusionFileName) {
try (BufferedReader reader = new BufferedReader(new FileReader(exclusionFileName, JPlagOptions.CHARSET))) {
try (BufferedReader reader = FileUtils.openFileReader(new File(exclusionFileName))) {
final var excludedFileNames = reader.lines().collect(Collectors.toSet());
if (logger.isDebugEnabled()) {
logger.debug("Excluded files:{}{}", System.lineSeparator(), String.join(System.lineSeparator(), excludedFileNames));
Expand Down
5 changes: 5 additions & 0 deletions language-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,10 @@
<groupId>org.kohsuke.metainf-services</groupId>
<artifactId>metainf-services</artifactId>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j-charset</artifactId>
<version>68.1</version>
</dependency>
</dependencies>
</project>
141 changes: 141 additions & 0 deletions language-api/src/main/java/de/jplag/util/FileUtils.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package de.jplag.util;

import java.io.*;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicReference;
import java.util.stream.Collectors;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

/**
* Encapsulates various interactions with files to prevent issues with file encodings.
*/
public class FileUtils {
private static final Charset defaultOutputCharset = StandardCharsets.UTF_8;
TwoOfTwelve marked this conversation as resolved.
Show resolved Hide resolved

/**
 * Opens a file reader, guessing the charset from the content. Also, if the file is encoded in a UTF* encoding and a BOM
 * exists, it is removed from the reader.
 * <p>
 * The file is opened twice: first as a byte stream that is only used for charset detection and is closed again before
 * this method returns, then as the reader that is handed to the caller. The caller is responsible for closing the
 * returned reader.
 * @param file The file to open for read
 * @return The reader, configured with the best matching charset
 * @throws IOException If the file does not exist or is not readable
 */
public static BufferedReader openFileReader(File file) throws IOException {
    Charset charset;
    // The byte stream is only needed to sniff the encoding; close it so the file handle is not leaked.
    try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) {
        charset = detectCharset(stream);
    }

    BufferedReader reader = new BufferedReader(new FileReader(file, charset));
    try {
        removeBom(reader, charset);
    } catch (IOException | RuntimeException e) {
        // The caller never receives the reader in this case, so it has to be closed here.
        try {
            reader.close();
        } catch (IOException closeFailure) {
            e.addSuppressed(closeFailure);
        }
        throw e;
    }
    return reader;
}

/**
 * Loads the complete text of a file. The lines of the file are joined with the system line separator, so the original
 * line terminators are not preserved and no separator is appended after the last line.
 * @param file The file to read
 * @return The content of the file as one string
 * @throws IOException If an IO error occurs
 * @see FileUtils#openFileReader(File)
 */
public static String readFileContent(File file) throws IOException {
    try (BufferedReader contentReader = openFileReader(file)) {
        StringJoiner content = new StringJoiner(System.lineSeparator());
        contentReader.lines().forEachOrdered(content::add);
        return content.toString();
    }
}

/**
* Removes the byte order mark from the beginning of the stream, if it exists and the charset is a UTF* charset. For
* details see: <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia</a>
* @param reader The reader to remove the bom from
* @throws IOException If an IO error occurs.
*/
private static void removeBom(BufferedReader reader, Charset charset) throws IOException {
if (charset.name().toUpperCase().startsWith("UTF")) {
reader.mark(10);
if (reader.read() != '\uFEFF') {
TwoOfTwelve marked this conversation as resolved.
Show resolved Hide resolved
reader.reset();
}
}
}

/**
 * Determines the most probable charset of a single file by inspecting its content. If only the content of the file is
 * needed, {@link #openFileReader(File)} or {@link #readFileContent(File)} are the better choice.
 * @param file The file to detect
 * @return The most probable charset
 * @throws IOException If an IO error occurs
 */
public static Charset detectCharset(File file) throws IOException {
    // Both streams are closed when the block exits; closing the buffered wrapper also closes the file stream.
    try (InputStream fileStream = new FileInputStream(file); InputStream bufferedStream = new BufferedInputStream(fileStream)) {
        return detectCharset(bufferedStream);
    }
}

/**
* Detects the most probable charset over the whole set of files.
* @param files The files to check
* @return The most probable charset
*/
public static Charset detectCharsetFromMultiple(Collection<File> files) {
Map<String, List<Integer>> charsetValues = new HashMap<>();

files.stream().map(it -> {
try (InputStream stream = new BufferedInputStream(new FileInputStream(it))) {
return detectAllCharsets(stream);
} catch (IOException e) {
throw new RuntimeException(e);
}
}).forEach(matches -> {
Set<String> remaining = new HashSet<>(Set.of(CharsetDetector.getAllDetectableCharsets()));
for (CharsetMatch match : matches) {
charsetValues.putIfAbsent(match.getName(), new ArrayList<>());
charsetValues.get(match.getName()).add(match.getConfidence());
remaining.remove(match.getName());
}
remaining.forEach(it -> {
charsetValues.putIfAbsent(it, new ArrayList<>());
charsetValues.get(it).add(0);
});
});
TwoOfTwelve marked this conversation as resolved.
Show resolved Hide resolved

AtomicReference<Charset> mostProbable = new AtomicReference<>(StandardCharsets.UTF_8);
AtomicReference<Double> mostProbableConfidence = new AtomicReference<>((double) 0);
TwoOfTwelve marked this conversation as resolved.
Show resolved Hide resolved
charsetValues.forEach((charset, confidenceValues) -> {
double average = confidenceValues.stream().mapToInt(it -> it).average().orElse(0);
if (average > mostProbableConfidence.get()) {
mostProbable.set(Charset.forName(charset));
mostProbableConfidence.set(average);
}
});

return mostProbable.get();
}

/**
 * Detects the most probable charset of the data provided by the stream. If the detector reports no match at all, UTF-8
 * is used as fallback.
 * <p>
 * NOTE(review): ICU's detector is documented to require a stream that supports mark/reset; the callers in this class
 * pass a {@link BufferedInputStream}. Confirm before calling with other stream types.
 * @param stream The stream providing the data to inspect
 * @return The most probable charset, or UTF-8 if none could be determined
 * @throws IOException If an IO error occurs
 */
private static Charset detectCharset(InputStream stream) throws IOException {
    CharsetDetector charsetDetector = new CharsetDetector();
    charsetDetector.setText(stream);

    CharsetMatch match = charsetDetector.detect();
    if (match == null) {
        // No charset matched the data; fall back instead of failing with a NullPointerException.
        return StandardCharsets.UTF_8;
    }
    return Charset.forName(match.getName());
}

/**
 * Runs the charset detector on the data provided by the stream and returns every match it reports.
 * @param stream The stream providing the data to inspect
 * @return All charset matches reported by the detector
 * @throws IOException If an IO error occurs
 */
private static CharsetMatch[] detectAllCharsets(InputStream stream) throws IOException {
    final CharsetDetector detector = new CharsetDetector();
    return detector.setText(stream).detectAll();
}

/**
 * Creates a buffered writer for the given file that encodes its output with the default output charset of JPlag.
 * @param file The file to write
 * @return The file writer, configured with the default charset
 * @throws IOException If the file does not exist or is not writable
 */
public static Writer openFileWriter(File file) throws IOException {
    Writer unbufferedWriter = new FileWriter(file, defaultOutputCharset);
    return new BufferedWriter(unbufferedWriter);
}
}
46 changes: 46 additions & 0 deletions language-api/src/test/java/de/jplag/util/FileUtilTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package de.jplag.util;

import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Set;

import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

/**
 * Tests for {@link FileUtils}. The files in the reader test directory are named after the charset they are expected to
 * be detected as, and each of them is expected to decode to the same text.
 */
public class FileUtilTest {
    private static final Path TEST_FILE_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileReaderTests");
    private static final Path TEST_FILE_SET_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileSetEncoding");

    private static final String EXPECTED_FILE_CONTENT = "Some ascii characters and some others: ä#+öü%&(/)?=?";

    /**
     * Supplies the parameterized tests with all files of the reader test directory.
     * @return The files located in the reader test directory
     */
    public static File[] searchTestFiles() {
        File testDirectory = TEST_FILE_LOCATION.toFile();
        return testDirectory.listFiles();
    }

    /**
     * Checks that every test file is decoded to the expected text.
     */
    @ParameterizedTest
    @MethodSource("searchTestFiles")
    public void testReadFile(File file) throws IOException {
        String actualContent = FileUtils.readFileContent(file);
        String failureMessage = "File contains unexpected content: " + file.getAbsolutePath();

        Assertions.assertEquals(EXPECTED_FILE_CONTENT, actualContent, failureMessage);
    }

    /**
     * Checks that the detected charset of every test file equals the charset the file is named after.
     */
    @ParameterizedTest
    @MethodSource("searchTestFiles")
    public void testCharsetDetection(File file) throws IOException {
        Charset expectedCharset = Charset.forName(file.getName());
        Charset detectedCharset = FileUtils.detectCharset(file);

        Assertions.assertEquals(expectedCharset, detectedCharset, "Wrong charset assumed for: " + file.getAbsolutePath());
    }

    /**
     * Checks that ISO-8859-1 is detected as the common charset of the file set directory.
     */
    @Test
    public void testDetectFromFileSet() {
        File[] fileSet = TEST_FILE_SET_LOCATION.toFile().listFiles();
        Charset detectedCharset = FileUtils.detectCharsetFromMultiple(Set.of(fileSet));

        Assertions.assertEquals(StandardCharsets.ISO_8859_1, detectedCharset);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some ascii characters and some others: ä#+öü%&(/)?=?
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some ascii characters and some others: ä#+öü%&(/)?=?
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
some simple ascii characters
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
some more ascii characters
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
this contains a non ascii character: �
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
Expand All @@ -17,6 +16,7 @@
import de.jplag.TokenType;
import de.jplag.cpp2.grammar.CPP14Lexer;
import de.jplag.cpp2.grammar.CPP14Parser;
import de.jplag.util.FileUtils;

/**
* The adapter between {@link AbstractParser} and the ANTLR based parser of this language module.
Expand All @@ -37,7 +37,7 @@ public List<Token> scan(Set<File> files) throws ParsingException {
this.currentFile = file;
logger.trace("Parsing file {}", currentFile);
try {
CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromStream(Files.newInputStream(file.toPath())));
CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromReader(FileUtils.openFileReader(file)));
// create a buffer of tokens pulled from the lexer
CommonTokenStream tokenStream = new CommonTokenStream(lexer);
CPP14Parser parser = new CPP14Parser(tokenStream);
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package de.jplag.csharp;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -19,6 +19,7 @@
import de.jplag.TokenType;
import de.jplag.csharp.grammar.CSharpLexer;
import de.jplag.csharp.grammar.CSharpParser;
import de.jplag.util.FileUtils;

/**
* Parser adapter for the ANTLR 4 CSharp Parser and Lexer. It receives file to parse and passes them to the ANTLR
Expand Down Expand Up @@ -51,11 +52,11 @@ public List<Token> parse(Set<File> files) throws ParsingException {
}

private void parseFile(File file) throws ParsingException {
try (FileInputStream inputStream = new FileInputStream(file)) {
try (BufferedReader reader = FileUtils.openFileReader(file)) {
currentFile = file;

// create a lexer, a parser and a buffer between them.
CSharpLexer lexer = new CSharpLexer(CharStreams.fromStream(inputStream));
CSharpLexer lexer = new CSharpLexer(CharStreams.fromReader(reader));
CommonTokenStream tokens = new CommonTokenStream(lexer);
CSharpParser parser = new CSharpParser(tokens);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package de.jplag.golang;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -19,6 +19,7 @@
import de.jplag.TokenType;
import de.jplag.golang.grammar.GoLexer;
import de.jplag.golang.grammar.GoParser;
import de.jplag.util.FileUtils;

public class GoParserAdapter extends AbstractParser {
private File currentFile;
Expand All @@ -34,10 +35,10 @@ public List<Token> parse(Set<File> files) throws ParsingException {
}

private void parseFile(File file) throws ParsingException {
try (FileInputStream inputStream = new FileInputStream(file)) {
try (BufferedReader reader = FileUtils.openFileReader(file)) {
currentFile = file;

GoLexer lexer = new GoLexer(CharStreams.fromStream(inputStream));
GoLexer lexer = new GoLexer(CharStreams.fromReader(reader));
CommonTokenStream tokenStream = new CommonTokenStream(lexer);
GoParser parser = new GoParser(tokenStream);

Expand Down
6 changes: 4 additions & 2 deletions languages/java/src/main/java/de/jplag/java/JavacAdapter.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
Expand All @@ -20,6 +20,7 @@

import de.jplag.ParsingException;
import de.jplag.Token;
import de.jplag.util.FileUtils;

import com.sun.source.tree.CompilationUnitTree;
import com.sun.source.tree.LineMap;
Expand All @@ -35,7 +36,8 @@ public void parseFiles(Set<File> files, final Parser parser) throws ParsingExcep
var listener = new DiagnosticCollector<>();

List<ParsingException> parsingExceptions = new ArrayList<>();
try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, StandardCharsets.UTF_8)) {
final Charset guessedCharset = FileUtils.detectCharsetFromMultiple(files);
try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, guessedCharset)) {
var javaFiles = fileManager.getJavaFileObjectsFromFiles(files);

// We need to disable annotation processing, see
Expand Down
7 changes: 4 additions & 3 deletions languages/python-3/src/main/java/de/jplag/python3/Parser.java
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
package de.jplag.python3;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -19,6 +19,7 @@
import de.jplag.python3.grammar.Python3Lexer;
import de.jplag.python3.grammar.Python3Parser;
import de.jplag.python3.grammar.Python3Parser.File_inputContext;
import de.jplag.util.FileUtils;

public class Parser extends AbstractParser {

Expand All @@ -43,11 +44,11 @@ public List<Token> parse(Set<File> files) throws ParsingException {
}

private void parseFile(File file) throws ParsingException {
try (FileInputStream fileInputStream = new FileInputStream((file))) {
try (BufferedReader reader = FileUtils.openFileReader(file)) {
currentFile = file;

// create a lexer that feeds off of input CharStream
Python3Lexer lexer = new Python3Lexer(CharStreams.fromStream(fileInputStream));
Python3Lexer lexer = new Python3Lexer(CharStreams.fromReader(reader));

// create a buffer of tokens pulled from the lexer
CommonTokenStream tokens = new CommonTokenStream(lexer);
Expand Down
Loading