-
Notifications
You must be signed in to change notification settings - Fork 332
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1026 from jplag/feature/file-encoding
Feature/file encoding
- Loading branch information
Showing
22 changed files
with
258 additions
and
30 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
154 changes: 154 additions & 0 deletions
154
language-api/src/main/java/de/jplag/util/FileUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,154 @@ | ||
package de.jplag.util; | ||
|
||
import java.io.*; | ||
import java.nio.charset.Charset; | ||
import java.nio.charset.StandardCharsets; | ||
import java.util.*; | ||
import java.util.concurrent.atomic.AtomicReference; | ||
import java.util.stream.Collectors; | ||
|
||
import de.jplag.ParsingException; | ||
|
||
import com.ibm.icu.text.CharsetDetector; | ||
import com.ibm.icu.text.CharsetMatch; | ||
|
||
/** | ||
* Encapsulates various interactions with files to prevent issues with file encodings. | ||
*/ | ||
public class FileUtils { | ||
private static final Charset DEFAULT_OUTPUT_CHARSET = StandardCharsets.UTF_8; | ||
private static final char BYTE_ORDER_MARK = '\uFEFF'; | ||
private static final int SINGLE_CHAR_BUFFER_SIZE = 10; | ||
|
||
private FileUtils() { | ||
} | ||
|
||
/** | ||
* Opens a file reader, guessing the charset from the content. Also, if the file is encoded in a UTF* encoding and a bom | ||
* exists, it is removed from the reader. | ||
* @param file The file to open for read | ||
* @return The reader, configured with the best matching charset | ||
* @throws IOException If the file does not exist for is not readable | ||
*/ | ||
public static BufferedReader openFileReader(File file) throws IOException { | ||
InputStream stream = new BufferedInputStream(new FileInputStream(file)); | ||
Charset charset = detectCharset(stream); | ||
BufferedReader reader = new BufferedReader(new FileReader(file, charset)); | ||
removeBom(reader, charset); | ||
return reader; | ||
} | ||
|
||
/** | ||
* Reads the contents of a file into a single string. | ||
* @param file The file to read | ||
* @return The files content as a string | ||
* @throws IOException If an IO error occurs | ||
* @see FileUtils#openFileReader(File) | ||
*/ | ||
public static String readFileContent(File file) throws IOException { | ||
try (BufferedReader reader = openFileReader(file)) { | ||
return reader.lines().collect(Collectors.joining(System.lineSeparator())); | ||
} | ||
} | ||
|
||
/** | ||
* Removes the byte order mark from the beginning of the stream, if it exists and the charset is a UTF* charset. For | ||
* details see: <a href="https://en.wikipedia.org/wiki/Byte_order_mark">Wikipedia</a> | ||
* @param reader The reader to remove the bom from | ||
* @throws IOException If an IO error occurs. | ||
*/ | ||
private static void removeBom(BufferedReader reader, Charset charset) throws IOException { | ||
if (charset.name().toUpperCase().startsWith("UTF")) { | ||
reader.mark(SINGLE_CHAR_BUFFER_SIZE); | ||
if (reader.read() != BYTE_ORDER_MARK) { | ||
reader.reset(); | ||
} | ||
} | ||
} | ||
|
||
/** | ||
* Detects the charset of a file. Prefer using {@link #openFileReader(File)} or {@link #readFileContent(File)} if you | ||
* are only interested in the content. | ||
* @param file The file to detect | ||
* @return The most probable charset | ||
* @throws IOException If an IO error occurs | ||
*/ | ||
public static Charset detectCharset(File file) throws IOException { | ||
try (InputStream stream = new BufferedInputStream(new FileInputStream((file)))) { | ||
return detectCharset(stream); | ||
} | ||
} | ||
|
||
/** | ||
* Detects the most probable charset over the whole set of files. | ||
* @param files The files to check | ||
* @return The most probable charset | ||
*/ | ||
public static Charset detectCharsetFromMultiple(Collection<File> files) throws ParsingException { | ||
Map<String, List<Integer>> charsetValues = new HashMap<>(); | ||
|
||
List<CharsetMatch[]> matchData = new ArrayList<>(); | ||
for (File file : files) { | ||
try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) { | ||
matchData.add(detectAllCharsets(stream)); | ||
} catch (IOException e) { | ||
throw new ParsingException(file, e); | ||
} | ||
} | ||
|
||
for (CharsetMatch[] matches : matchData) { | ||
Set<String> remaining = new HashSet<>(Set.of(CharsetDetector.getAllDetectableCharsets())); | ||
for (CharsetMatch match : matches) { | ||
charsetValues.putIfAbsent(match.getName(), new ArrayList<>()); | ||
charsetValues.get(match.getName()).add(match.getConfidence()); | ||
remaining.remove(match.getName()); | ||
} | ||
remaining.forEach(it -> { | ||
charsetValues.putIfAbsent(it, new ArrayList<>()); | ||
charsetValues.get(it).add(0); | ||
}); | ||
} | ||
|
||
AtomicReference<Charset> mostProbable = new AtomicReference<>(StandardCharsets.UTF_8); | ||
AtomicReference<Double> mostProbableConfidence = new AtomicReference<>(0.0); | ||
charsetValues.forEach((charset, confidenceValues) -> { | ||
double average = confidenceValues.stream().mapToInt(it -> it).average().orElse(0); | ||
if (confidenceValues.stream().anyMatch(it -> it == 0)) { | ||
average = 0; | ||
} | ||
if (average > mostProbableConfidence.get()) { | ||
mostProbable.set(Charset.forName(charset)); | ||
mostProbableConfidence.set(average); | ||
} | ||
}); | ||
|
||
return mostProbable.get(); | ||
} | ||
|
||
private static Charset detectCharset(InputStream stream) throws IOException { | ||
CharsetDetector charsetDetector = new CharsetDetector(); | ||
|
||
charsetDetector.setText(stream); | ||
|
||
CharsetMatch match = charsetDetector.detect(); | ||
return Charset.forName(match.getName()); | ||
} | ||
|
||
private static CharsetMatch[] detectAllCharsets(InputStream stream) throws IOException { | ||
CharsetDetector charsetDetector = new CharsetDetector(); | ||
|
||
charsetDetector.setText(stream); | ||
|
||
return charsetDetector.detectAll(); | ||
} | ||
|
||
/** | ||
* Opens a file writer, using the default charset for JPlag | ||
* @param file The file to write | ||
* @return The file writer, configured with the default charset | ||
* @throws IOException If the file does not exist or is not writable | ||
*/ | ||
public static Writer openFileWriter(File file) throws IOException { | ||
return new BufferedWriter(new FileWriter(file, DEFAULT_OUTPUT_CHARSET)); | ||
} | ||
} |
48 changes: 48 additions & 0 deletions
48
language-api/src/test/java/de/jplag/util/FileUtilTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
package de.jplag.util; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.nio.charset.Charset; | ||
import java.nio.charset.StandardCharsets; | ||
import java.nio.file.Path; | ||
import java.util.Set; | ||
|
||
import org.junit.jupiter.api.Assertions; | ||
import org.junit.jupiter.api.Test; | ||
import org.junit.jupiter.params.ParameterizedTest; | ||
import org.junit.jupiter.params.provider.MethodSource; | ||
|
||
import de.jplag.ParsingException; | ||
|
||
public class FileUtilTest { | ||
private static final Path TEST_FILE_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileReaderTests"); | ||
private static final Path TEST_FILE_SET_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileSetEncoding"); | ||
|
||
private static final String expectedFileContent = "Some ascii characters and some others: ä#+öü%&(/)?=?"; | ||
|
||
@ParameterizedTest | ||
@MethodSource("searchTestFiles") | ||
void testReadFile(File file) throws IOException { | ||
String found = FileUtils.readFileContent(file); | ||
|
||
Assertions.assertEquals(expectedFileContent, found, "File contains unexpected content: " + file.getAbsolutePath()); | ||
} | ||
|
||
@ParameterizedTest | ||
@MethodSource("searchTestFiles") | ||
void testCharsetDetection(File file) throws IOException { | ||
Assertions.assertEquals(Charset.forName(file.getName()), FileUtils.detectCharset(file), | ||
"Wrong charset assumed for: " + file.getAbsolutePath()); | ||
} | ||
|
||
@Test | ||
void testDetectFromFileSet() throws ParsingException { | ||
Set<File> files = Set.of(TEST_FILE_SET_LOCATION.toFile().listFiles()); | ||
Charset encoding = FileUtils.detectCharsetFromMultiple(files); | ||
Assertions.assertEquals(StandardCharsets.ISO_8859_1, encoding); | ||
} | ||
|
||
public static File[] searchTestFiles() { | ||
return TEST_FILE_LOCATION.toFile().listFiles(); | ||
} | ||
} |
1 change: 1 addition & 0 deletions
1
language-api/src/test/resources/de/jplag/fileReaderTests/ISO-8859-1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Some ascii characters and some others: ä#+öü%&(/)?=? |
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Some ascii characters and some others: ä#+öü%&(/)?=? |
1 change: 1 addition & 0 deletions
1
language-api/src/test/resources/de/jplag/fileSetEncoding/ascii1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
some simple ascii characters |
1 change: 1 addition & 0 deletions
1
language-api/src/test/resources/de/jplag/fileSetEncoding/ascii2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
some more ascii characters |
1 change: 1 addition & 0 deletions
1
language-api/src/test/resources/de/jplag/fileSetEncoding/notAscii
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
this contains a non ascii character: � |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.