From 44a177ab4f3de4bb8ea076fc648d070a44068dce Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Tue, 11 Apr 2023 13:37:42 +0200 Subject: [PATCH 01/11] Basic implementation for file encoding detection --- .../java/de/jplag/options/JPlagOptions.java | 4 +- language-api/pom.xml | 5 + .../main/java/de/jplag/util/FileUtils.java | 91 ++++++++++++++++++ .../test/java/de/jplag/util/FileUtilTest.java | 35 +++++++ .../de/jplag/fileReaderTests/ISO-8859-1 | 1 + .../de/jplag/fileReaderTests/UTF-16LE | Bin 0 -> 106 bytes .../de/jplag/fileReaderTests/UTF-32BE | Bin 0 -> 208 bytes .../resources/de/jplag/fileReaderTests/UTF-8 | 1 + 8 files changed, 135 insertions(+), 2 deletions(-) create mode 100644 language-api/src/main/java/de/jplag/util/FileUtils.java create mode 100644 language-api/src/test/java/de/jplag/util/FileUtilTest.java create mode 100644 language-api/src/test/resources/de/jplag/fileReaderTests/ISO-8859-1 create mode 100644 language-api/src/test/resources/de/jplag/fileReaderTests/UTF-16LE create mode 100644 language-api/src/test/resources/de/jplag/fileReaderTests/UTF-32BE create mode 100644 language-api/src/test/resources/de/jplag/fileReaderTests/UTF-8 diff --git a/core/src/main/java/de/jplag/options/JPlagOptions.java b/core/src/main/java/de/jplag/options/JPlagOptions.java index e392fefdf..87da296c8 100644 --- a/core/src/main/java/de/jplag/options/JPlagOptions.java +++ b/core/src/main/java/de/jplag/options/JPlagOptions.java @@ -2,7 +2,6 @@ import java.io.BufferedReader; import java.io.File; -import java.io.FileReader; import java.io.IOException; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; @@ -20,6 +19,7 @@ import de.jplag.Language; import de.jplag.clustering.ClusteringOptions; import de.jplag.exceptions.BasecodeException; +import de.jplag.util.FileUtils; /** * This record defines the options to configure {@link JPlag}. 
@@ -184,7 +184,7 @@ public Integer minimumTokenMatch() { } private Set readExclusionFile(final String exclusionFileName) { - try (BufferedReader reader = new BufferedReader(new FileReader(exclusionFileName, JPlagOptions.CHARSET))) { + try (BufferedReader reader = FileUtils.openFileReader(new File(exclusionFileName))) { final var excludedFileNames = reader.lines().collect(Collectors.toSet()); if (logger.isDebugEnabled()) { logger.debug("Excluded files:{}{}", System.lineSeparator(), String.join(System.lineSeparator(), excludedFileNames)); diff --git a/language-api/pom.xml b/language-api/pom.xml index 2476d5e21..623efc192 100644 --- a/language-api/pom.xml +++ b/language-api/pom.xml @@ -15,5 +15,10 @@ org.kohsuke.metainf-services metainf-services + + com.ibm.icu + icu4j-charset + 68.1 + diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java new file mode 100644 index 000000000..6f0748b6b --- /dev/null +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -0,0 +1,91 @@ +package de.jplag.util; + +import java.io.*; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.stream.Collectors; + +import com.ibm.icu.text.CharsetDetector; +import com.ibm.icu.text.CharsetMatch; + +/** + * Encapsulates various interactions with files to prevent issues with file encodings. + */ +public class FileUtils { + private static final Charset defaultOutputCharset = StandardCharsets.UTF_8; + + /** + * Opens a file reader, guessing the charset from the content. Also, if the file is encoded in a UTF* encoding and a bom + * exists, it is removed from the reader. 
+ * @param file The file to open for read + * @return The reader, configured with the best matching charset + * @throws IOException If the file does not exist for is not readable + */ + public static BufferedReader openFileReader(File file) throws IOException { + InputStream stream = new BufferedInputStream(new FileInputStream(file)); + Charset charset = detectCharset(stream); + BufferedReader reader = new BufferedReader(new FileReader(file, charset)); + removeBom(reader, charset); + return reader; + } + + /** + * Reads the contents of a file into a single string. + * @param file The file to read + * @return The files content as a string + * @throws IOException If an IO error occurs + * @see FileUtils#openFileReader(File) + */ + public static String readFileContent(File file) throws IOException { + try (BufferedReader reader = openFileReader(file)) { + return reader.lines().collect(Collectors.joining(System.lineSeparator())); + } + } + + /** + * Removes the byte order mark from the beginning of the stream, if it exists and the charset is a UTF* charset. For + * details see: Wikipedia + * @param reader The reader to remove the bom from + * @throws IOException If an IO error occurs. + */ + private static void removeBom(BufferedReader reader, Charset charset) throws IOException { + if (charset.name().toUpperCase().startsWith("UTF")) { + reader.mark(10); + if (reader.read() != '\uFEFF') { + reader.reset(); + } + } + } + + /** + * Detects the charset of a file. Prefer using {@link #openFileReader(File)} or {@link #readFileContent(File)} if you + * are only interested in the content. 
+ * @param file The file to detect + * @return The most probable charset + * @throws IOException If an IO error occurs + */ + public static Charset detectCharset(File file) throws IOException { + try (InputStream stream = new BufferedInputStream(new FileInputStream((file)))) { + return detectCharset(stream); + } + } + + private static Charset detectCharset(InputStream stream) throws IOException { + CharsetDetector charsetDetector = new CharsetDetector(); + + charsetDetector.setText(stream); + + CharsetMatch match = charsetDetector.detect(); + return Charset.forName(match.getName()); + } + + /** + * Opens a file writer, using the default charset for JPlag + * @param file The file to write + * @return The file writer, configured with the default charset + * @throws IOException If the file does not exist or is not writable + */ + public static Writer openFileWriter(File file) throws IOException { + return new BufferedWriter(new FileWriter(file, defaultOutputCharset)); + } +} diff --git a/language-api/src/test/java/de/jplag/util/FileUtilTest.java b/language-api/src/test/java/de/jplag/util/FileUtilTest.java new file mode 100644 index 000000000..52ab58e6a --- /dev/null +++ b/language-api/src/test/java/de/jplag/util/FileUtilTest.java @@ -0,0 +1,35 @@ +package de.jplag.util; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Path; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.MethodSource; + +public class FileUtilTest { + private static final Path TEST_FILE_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileReaderTests"); + + private static final String expectedFileContent = "Some ascii characters and some others: ä#+öü%&(/)?=?"; + + @ParameterizedTest + @MethodSource("searchTestFiles") + public void testReadFile(File file) throws IOException { + String found = FileUtils.readFileContent(file); + + 
Assertions.assertEquals(expectedFileContent, found, "File contains unexpected content: " + file.getAbsolutePath()); + } + + @ParameterizedTest + @MethodSource("searchTestFiles") + public void testCharsetDetection(File file) throws IOException { + Assertions.assertEquals(Charset.forName(file.getName()), FileUtils.detectCharset(file), + "Wrong charset assumed for: " + file.getAbsolutePath()); + } + + public static File[] searchTestFiles() { + return TEST_FILE_LOCATION.toFile().listFiles(); + } +} diff --git a/language-api/src/test/resources/de/jplag/fileReaderTests/ISO-8859-1 b/language-api/src/test/resources/de/jplag/fileReaderTests/ISO-8859-1 new file mode 100644 index 000000000..292674145 --- /dev/null +++ b/language-api/src/test/resources/de/jplag/fileReaderTests/ISO-8859-1 @@ -0,0 +1 @@ +Some ascii characters and some others: �#+��%&(/)?=? \ No newline at end of file diff --git a/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-16LE b/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-16LE new file mode 100644 index 0000000000000000000000000000000000000000..642cfe74c2f26dbb6d465c9f38d3f6fd48cc4acc GIT binary patch literal 106 zcmezWFPI^pA(tVQL4hHWp_n0=ArlA{fOH0sUj)R-3?)F>BA^^dMIJ*6kS>Pn$p^}2 iK;^A~GEW$k8MGO`G5ldrWl&?#V9;mKWUyzj1!4fvxD?v} literal 0 HcmV?d00001 diff --git a/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-32BE b/language-api/src/test/resources/de/jplag/fileReaderTests/UTF-32BE new file mode 100644 index 0000000000000000000000000000000000000000..f3cd324242dc40132ca74ac4943fd8c1c4576489 GIT binary patch literal 208 zcmZ9_K?*=n5XSL2%9MyK?3EnC3FIjjC>+@ttStP$dJ9?nnr7a7^USO>vkePIIC>ly yFh?HyiK_QKR_&ZnRx{-s+F23c(ER_8dS!L Date: Wed, 12 Apr 2023 12:24:36 +0200 Subject: [PATCH 02/11] Using FileUtils for file accesses. 
--- .../main/java/de/jplag/util/FileUtils.java | 50 +++++++++++++++++++ .../java/de/jplag/cpp2/CPPParserAdapter.java | 4 +- .../de/jplag/csharp/CSharpParserAdapter.java | 7 +-- .../java/de/jplag/golang/GoParserAdapter.java | 7 +-- .../main/java/de/jplag/java/JavacAdapter.java | 6 ++- .../main/java/de/jplag/python3/Parser.java | 7 +-- .../java/de/jplag/rlang/RParserAdapter.java | 7 +-- .../java/de/jplag/rust/RustParserAdapter.java | 7 +-- .../main/scala/de/jplag/scala/Parser.scala | 5 +- languages/scheme/src/main/javacc/Scheme.jj | 8 ++- .../de/jplag/swift/SwiftParserAdapter.java | 7 +-- .../java/de/jplag/text/ParserAdapter.java | 4 +- 12 files changed, 91 insertions(+), 28 deletions(-) diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java index 6f0748b6b..63f4f282c 100644 --- a/language-api/src/main/java/de/jplag/util/FileUtils.java +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -3,6 +3,8 @@ import java.io.*; import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; import com.ibm.icu.text.CharsetDetector; @@ -70,6 +72,46 @@ public static Charset detectCharset(File file) throws IOException { } } + /** + * Detects the most probable charset over the whole set of files. 
+ * @param files The files to check + * @return The most probable charset + */ + public static Charset detectCharsetFromMultiple(Collection files) { + Map> charsetValues = new HashMap<>(); + + files.stream().map(it -> { + try (InputStream stream = new BufferedInputStream(new FileInputStream(it))) { + return detectAllCharsets(stream); + } catch (IOException e) { + throw new RuntimeException(e); + } + }).forEach(matches -> { + Set remaining = new HashSet<>(Set.of(CharsetDetector.getAllDetectableCharsets())); + for (CharsetMatch match : matches) { + charsetValues.putIfAbsent(match.getName(), new ArrayList<>()); + charsetValues.get(match.getName()).add(match.getConfidence()); + remaining.remove(match.getName()); + } + remaining.forEach(it -> { + charsetValues.putIfAbsent(it, new ArrayList<>()); + charsetValues.get(it).add(0); + }); + }); + + AtomicReference mostProbable = new AtomicReference<>(StandardCharsets.UTF_8); + AtomicReference mostProbableConfidence = new AtomicReference<>((double) 0); + charsetValues.forEach((charset, confidenceValues) -> { + double average = confidenceValues.stream().mapToInt(it -> it).average().orElse(0); + if (average > mostProbableConfidence.get()) { + mostProbable.set(Charset.forName(charset)); + mostProbableConfidence.set(average); + } + }); + + return mostProbable.get(); + } + private static Charset detectCharset(InputStream stream) throws IOException { CharsetDetector charsetDetector = new CharsetDetector(); @@ -79,6 +121,14 @@ private static Charset detectCharset(InputStream stream) throws IOException { return Charset.forName(match.getName()); } + private static CharsetMatch[] detectAllCharsets(InputStream stream) throws IOException { + CharsetDetector charsetDetector = new CharsetDetector(); + + charsetDetector.setText(stream); + + return charsetDetector.detectAll(); + } + /** * Opens a file writer, using the default charset for JPlag * @param file The file to write diff --git 
a/languages/cpp2/src/main/java/de/jplag/cpp2/CPPParserAdapter.java b/languages/cpp2/src/main/java/de/jplag/cpp2/CPPParserAdapter.java index c123dc29e..ed46d18e0 100644 --- a/languages/cpp2/src/main/java/de/jplag/cpp2/CPPParserAdapter.java +++ b/languages/cpp2/src/main/java/de/jplag/cpp2/CPPParserAdapter.java @@ -2,7 +2,6 @@ import java.io.File; import java.io.IOException; -import java.nio.file.Files; import java.util.ArrayList; import java.util.List; import java.util.Set; @@ -17,6 +16,7 @@ import de.jplag.TokenType; import de.jplag.cpp2.grammar.CPP14Lexer; import de.jplag.cpp2.grammar.CPP14Parser; +import de.jplag.util.FileUtils; /** * The adapter between {@link AbstractParser} and the ANTLR based parser of this language module. @@ -37,7 +37,7 @@ public List scan(Set files) throws ParsingException { this.currentFile = file; logger.trace("Parsing file {}", currentFile); try { - CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromStream(Files.newInputStream(file.toPath()))); + CPP14Lexer lexer = new CPP14Lexer(CharStreams.fromReader(FileUtils.openFileReader(file))); // create a buffer of tokens pulled from the lexer CommonTokenStream tokenStream = new CommonTokenStream(lexer); CPP14Parser parser = new CPP14Parser(tokenStream); diff --git a/languages/csharp/src/main/java/de/jplag/csharp/CSharpParserAdapter.java b/languages/csharp/src/main/java/de/jplag/csharp/CSharpParserAdapter.java index a3668b1a2..3b2b0d8a2 100644 --- a/languages/csharp/src/main/java/de/jplag/csharp/CSharpParserAdapter.java +++ b/languages/csharp/src/main/java/de/jplag/csharp/CSharpParserAdapter.java @@ -1,7 +1,7 @@ package de.jplag.csharp; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -19,6 +19,7 @@ import de.jplag.TokenType; import de.jplag.csharp.grammar.CSharpLexer; import de.jplag.csharp.grammar.CSharpParser; +import de.jplag.util.FileUtils; /** * Parser adapter for the ANTLR 
4 CSharp Parser and Lexer. It receives file to parse and passes them to the ANTLR @@ -51,11 +52,11 @@ public List parse(Set files) throws ParsingException { } private void parseFile(File file) throws ParsingException { - try (FileInputStream inputStream = new FileInputStream(file)) { + try (BufferedReader reader = FileUtils.openFileReader(file)) { currentFile = file; // create a lexer, a parser and a buffer between them. - CSharpLexer lexer = new CSharpLexer(CharStreams.fromStream(inputStream)); + CSharpLexer lexer = new CSharpLexer(CharStreams.fromReader(reader)); CommonTokenStream tokens = new CommonTokenStream(lexer); CSharpParser parser = new CSharpParser(tokens); diff --git a/languages/golang/src/main/java/de/jplag/golang/GoParserAdapter.java b/languages/golang/src/main/java/de/jplag/golang/GoParserAdapter.java index b179e0640..3d9da82c6 100644 --- a/languages/golang/src/main/java/de/jplag/golang/GoParserAdapter.java +++ b/languages/golang/src/main/java/de/jplag/golang/GoParserAdapter.java @@ -1,7 +1,7 @@ package de.jplag.golang; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -19,6 +19,7 @@ import de.jplag.TokenType; import de.jplag.golang.grammar.GoLexer; import de.jplag.golang.grammar.GoParser; +import de.jplag.util.FileUtils; public class GoParserAdapter extends AbstractParser { private File currentFile; @@ -34,10 +35,10 @@ public List parse(Set files) throws ParsingException { } private void parseFile(File file) throws ParsingException { - try (FileInputStream inputStream = new FileInputStream(file)) { + try (BufferedReader reader = FileUtils.openFileReader(file)) { currentFile = file; - GoLexer lexer = new GoLexer(CharStreams.fromStream(inputStream)); + GoLexer lexer = new GoLexer(CharStreams.fromReader(reader)); CommonTokenStream tokenStream = new CommonTokenStream(lexer); GoParser parser = new GoParser(tokenStream); diff --git 
a/languages/java/src/main/java/de/jplag/java/JavacAdapter.java b/languages/java/src/main/java/de/jplag/java/JavacAdapter.java index bd6972b87..07a501761 100644 --- a/languages/java/src/main/java/de/jplag/java/JavacAdapter.java +++ b/languages/java/src/main/java/de/jplag/java/JavacAdapter.java @@ -2,7 +2,7 @@ import java.io.File; import java.io.IOException; -import java.nio.charset.StandardCharsets; +import java.nio.charset.Charset; import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -20,6 +20,7 @@ import de.jplag.ParsingException; import de.jplag.Token; +import de.jplag.util.FileUtils; import com.sun.source.tree.CompilationUnitTree; import com.sun.source.tree.LineMap; @@ -35,7 +36,8 @@ public void parseFiles(Set files, final Parser parser) throws ParsingExcep var listener = new DiagnosticCollector<>(); List parsingExceptions = new ArrayList<>(); - try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, StandardCharsets.UTF_8)) { + final Charset guessedCharset = FileUtils.detectCharsetFromMultiple(files); + try (final StandardJavaFileManager fileManager = javac.getStandardFileManager(listener, null, guessedCharset)) { var javaFiles = fileManager.getJavaFileObjectsFromFiles(files); // We need to disable annotation processing, see diff --git a/languages/python-3/src/main/java/de/jplag/python3/Parser.java b/languages/python-3/src/main/java/de/jplag/python3/Parser.java index a24ccc15c..2dc352bfe 100644 --- a/languages/python-3/src/main/java/de/jplag/python3/Parser.java +++ b/languages/python-3/src/main/java/de/jplag/python3/Parser.java @@ -1,7 +1,7 @@ package de.jplag.python3; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -19,6 +19,7 @@ import de.jplag.python3.grammar.Python3Lexer; import de.jplag.python3.grammar.Python3Parser; import 
de.jplag.python3.grammar.Python3Parser.File_inputContext; +import de.jplag.util.FileUtils; public class Parser extends AbstractParser { @@ -43,11 +44,11 @@ public List parse(Set files) throws ParsingException { } private void parseFile(File file) throws ParsingException { - try (FileInputStream fileInputStream = new FileInputStream((file))) { + try (BufferedReader reader = FileUtils.openFileReader(file)) { currentFile = file; // create a lexer that feeds off of input CharStream - Python3Lexer lexer = new Python3Lexer(CharStreams.fromStream(fileInputStream)); + Python3Lexer lexer = new Python3Lexer(CharStreams.fromReader(reader)); // create a buffer of tokens pulled from the lexer CommonTokenStream tokens = new CommonTokenStream(lexer); diff --git a/languages/rlang/src/main/java/de/jplag/rlang/RParserAdapter.java b/languages/rlang/src/main/java/de/jplag/rlang/RParserAdapter.java index 6da61e4c1..e3c3aa6bc 100644 --- a/languages/rlang/src/main/java/de/jplag/rlang/RParserAdapter.java +++ b/languages/rlang/src/main/java/de/jplag/rlang/RParserAdapter.java @@ -1,7 +1,7 @@ package de.jplag.rlang; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -20,6 +20,7 @@ import de.jplag.rlang.grammar.RFilter; import de.jplag.rlang.grammar.RLexer; import de.jplag.rlang.grammar.RParser; +import de.jplag.util.FileUtils; /** * This class sets up the lexer and parser generated by ANTLR4, feeds the submissions through them and passes the @@ -52,11 +53,11 @@ public List parse(Set files) throws ParsingException { } private void parseFile(File file) throws ParsingException { - try (FileInputStream inputStream = new FileInputStream(file)) { + try (BufferedReader reader = FileUtils.openFileReader(file)) { currentFile = file; // create a lexer, a parser and a buffer between them. 
- RLexer lexer = new RLexer(CharStreams.fromStream(inputStream)); + RLexer lexer = new RLexer(CharStreams.fromReader(reader)); CommonTokenStream tokens = new CommonTokenStream(lexer); RFilter filter = new RFilter(tokens); diff --git a/languages/rust/src/main/java/de/jplag/rust/RustParserAdapter.java b/languages/rust/src/main/java/de/jplag/rust/RustParserAdapter.java index 5a4d9920c..092889054 100644 --- a/languages/rust/src/main/java/de/jplag/rust/RustParserAdapter.java +++ b/languages/rust/src/main/java/de/jplag/rust/RustParserAdapter.java @@ -1,7 +1,7 @@ package de.jplag.rust; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -18,6 +18,7 @@ import de.jplag.Token; import de.jplag.rust.grammar.RustLexer; import de.jplag.rust.grammar.RustParser; +import de.jplag.util.FileUtils; public class RustParserAdapter extends AbstractParser { @@ -39,11 +40,11 @@ public List parse(Set files) throws ParsingException { } private void parseFile(File file) throws ParsingException { - try (FileInputStream inputStream = new FileInputStream(file)) { + try (BufferedReader reader = FileUtils.openFileReader(file)) { currentFile = file; // create a lexer, a parser and a buffer between them. 
- RustLexer lexer = new RustLexer(CharStreams.fromStream(inputStream)); + RustLexer lexer = new RustLexer(CharStreams.fromReader(reader)); CommonTokenStream tokenStream = new CommonTokenStream(lexer); RustParser parser = new RustParser(tokenStream); diff --git a/languages/scala/src/main/scala/de/jplag/scala/Parser.scala b/languages/scala/src/main/scala/de/jplag/scala/Parser.scala index c4436eb5e..c5e2f396c 100644 --- a/languages/scala/src/main/scala/de/jplag/scala/Parser.scala +++ b/languages/scala/src/main/scala/de/jplag/scala/Parser.scala @@ -1,9 +1,11 @@ package de.jplag.scala import de.jplag.scala.ScalaTokenType._ +import de.jplag.util.FileUtils import de.jplag.{AbstractParser, ParsingException, Token} import java.io.File +import java.util.stream.Collectors import scala.collection.mutable.ListBuffer import scala.meta._ @@ -343,8 +345,7 @@ class Parser extends AbstractParser { currentFile = file try { - val bytes = java.nio.file.Files.readAllBytes(file.toPath) - val text = new String(bytes, "UTF-8") + val text = FileUtils.readFileContent(file) val input = Input.VirtualFile(file.getPath, text) val ast = input.parse[Source].get traverser(ast) diff --git a/languages/scheme/src/main/javacc/Scheme.jj b/languages/scheme/src/main/javacc/Scheme.jj index de2797daf..1b8384992 100644 --- a/languages/scheme/src/main/javacc/Scheme.jj +++ b/languages/scheme/src/main/javacc/Scheme.jj @@ -58,7 +58,10 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.nio.charset.Charset; + import de.jplag.ParsingException; +import de.jplag.util.FileUtils; public class SchemeParser { /* used for context in the template production rule */ @@ -68,10 +71,11 @@ public class SchemeParser { public static void parseFile(File file, SchemeParser parser, Parser parserX) throws ParsingException { try { FileInputStream in = new FileInputStream(file); + Charset charset = FileUtils.detectCharset(file); if (parser == null) { - parser = new 
SchemeParser(in, "UTF-8"); + parser = new SchemeParser(in, charset.name()); } else { - parser.ReInit(in, "UTF-8"); + parser.ReInit(in, charset.name()); } parser.parser2 = parserX; } catch (FileNotFoundException e) { diff --git a/languages/swift/src/main/java/de/jplag/swift/SwiftParserAdapter.java b/languages/swift/src/main/java/de/jplag/swift/SwiftParserAdapter.java index a6614257f..26c8025a8 100644 --- a/languages/swift/src/main/java/de/jplag/swift/SwiftParserAdapter.java +++ b/languages/swift/src/main/java/de/jplag/swift/SwiftParserAdapter.java @@ -1,7 +1,7 @@ package de.jplag.swift; +import java.io.BufferedReader; import java.io.File; -import java.io.FileInputStream; import java.io.IOException; import java.util.ArrayList; import java.util.List; @@ -18,6 +18,7 @@ import de.jplag.Token; import de.jplag.swift.grammar.Swift5Lexer; import de.jplag.swift.grammar.Swift5Parser; +import de.jplag.util.FileUtils; public class SwiftParserAdapter extends AbstractParser { @@ -47,10 +48,10 @@ public List parse(Set files) throws ParsingException { } private void parse(File file) throws ParsingException { - try (FileInputStream inputStream = new FileInputStream(file)) { + try (BufferedReader reader = FileUtils.openFileReader(file)) { currentFile = file; - Swift5Lexer lexer = new Swift5Lexer(CharStreams.fromStream(inputStream)); + Swift5Lexer lexer = new Swift5Lexer(CharStreams.fromReader(reader)); CommonTokenStream tokenStream = new CommonTokenStream(lexer); Swift5Parser parser = new Swift5Parser(tokenStream); diff --git a/languages/text/src/main/java/de/jplag/text/ParserAdapter.java b/languages/text/src/main/java/de/jplag/text/ParserAdapter.java index 975dbe7e3..c3d1ccc83 100644 --- a/languages/text/src/main/java/de/jplag/text/ParserAdapter.java +++ b/languages/text/src/main/java/de/jplag/text/ParserAdapter.java @@ -2,7 +2,6 @@ import java.io.File; import java.io.IOException; -import java.nio.file.Files; import java.util.ArrayList; import java.util.List; import 
java.util.Properties; @@ -11,6 +10,7 @@ import de.jplag.AbstractParser; import de.jplag.ParsingException; import de.jplag.Token; +import de.jplag.util.FileUtils; import edu.stanford.nlp.ling.CoreLabel; import edu.stanford.nlp.pipeline.CoreDocument; @@ -100,7 +100,7 @@ private void addToken(CoreLabel label) { private String readFile(File file) throws ParsingException { try { - return Files.readString(file.toPath()); + return FileUtils.readFileContent(file); } catch (IOException e) { throw new ParsingException(file, e.getMessage(), e); } From 48f27eb2017f9861b17f9ca9aded2c3643068143 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Wed, 12 Apr 2023 12:46:08 +0200 Subject: [PATCH 03/11] Added a test case for guessing the charset from multiple files. --- .../src/test/java/de/jplag/util/FileUtilTest.java | 11 +++++++++++ .../test/resources/de/jplag/fileSetEncoding/ascii1 | 1 + .../test/resources/de/jplag/fileSetEncoding/ascii2 | 1 + .../test/resources/de/jplag/fileSetEncoding/notAscii | 1 + 4 files changed, 14 insertions(+) create mode 100644 language-api/src/test/resources/de/jplag/fileSetEncoding/ascii1 create mode 100644 language-api/src/test/resources/de/jplag/fileSetEncoding/ascii2 create mode 100644 language-api/src/test/resources/de/jplag/fileSetEncoding/notAscii diff --git a/language-api/src/test/java/de/jplag/util/FileUtilTest.java b/language-api/src/test/java/de/jplag/util/FileUtilTest.java index 52ab58e6a..ff66c0b49 100644 --- a/language-api/src/test/java/de/jplag/util/FileUtilTest.java +++ b/language-api/src/test/java/de/jplag/util/FileUtilTest.java @@ -3,14 +3,18 @@ import java.io.File; import java.io.IOException; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.nio.file.Path; +import java.util.Set; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; public class FileUtilTest { private 
static final Path TEST_FILE_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileReaderTests"); + private static final Path TEST_FILE_SET_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileSetEncoding"); private static final String expectedFileContent = "Some ascii characters and some others: ä#+öü%&(/)?=?"; @@ -29,6 +33,13 @@ public void testCharsetDetection(File file) throws IOException { "Wrong charset assumed for: " + file.getAbsolutePath()); } + @Test + public void testDetectFromFileSet() { + Set files = Set.of(TEST_FILE_SET_LOCATION.toFile().listFiles()); + Charset encoding = FileUtils.detectCharsetFromMultiple(files); + Assertions.assertEquals(StandardCharsets.ISO_8859_1, encoding); + } + public static File[] searchTestFiles() { return TEST_FILE_LOCATION.toFile().listFiles(); } diff --git a/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii1 b/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii1 new file mode 100644 index 000000000..4d039c8df --- /dev/null +++ b/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii1 @@ -0,0 +1 @@ +some simple ascii characters \ No newline at end of file diff --git a/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii2 b/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii2 new file mode 100644 index 000000000..72af43005 --- /dev/null +++ b/language-api/src/test/resources/de/jplag/fileSetEncoding/ascii2 @@ -0,0 +1 @@ +some more ascii characters \ No newline at end of file diff --git a/language-api/src/test/resources/de/jplag/fileSetEncoding/notAscii b/language-api/src/test/resources/de/jplag/fileSetEncoding/notAscii new file mode 100644 index 000000000..3cfd99b9d --- /dev/null +++ b/language-api/src/test/resources/de/jplag/fileSetEncoding/notAscii @@ -0,0 +1 @@ +this contains a non ascii character: � From be51ee2441db7e1d3b55da144a25c3735de18570 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Wed, 12 Apr 2023 12:59:17 +0200 Subject: 
[PATCH 04/11] Small error I missed when first committing. --- languages/scheme/src/main/javacc/Scheme.jj | 2 ++ 1 file changed, 2 insertions(+) diff --git a/languages/scheme/src/main/javacc/Scheme.jj b/languages/scheme/src/main/javacc/Scheme.jj index 1b8384992..94d9b1a90 100644 --- a/languages/scheme/src/main/javacc/Scheme.jj +++ b/languages/scheme/src/main/javacc/Scheme.jj @@ -81,6 +81,8 @@ public class SchemeParser { } catch (FileNotFoundException e) { System.out.println("Scheme Parser R4RS: File " + file.getName() + " not found."); throw new ParsingException(file, e.getMessage(), e); + } catch (IOException e) { + throw new ParsingException(file, e.getMessage(), e); } try { parser.Program(); From 8e95667b1ee3cb4abecab7acda85e5d02bb84163 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Wed, 12 Apr 2023 12:59:17 +0200 Subject: [PATCH 05/11] Small error I missed when first committing. --- languages/scheme/src/main/javacc/Scheme.jj | 3 +++ 1 file changed, 3 insertions(+) diff --git a/languages/scheme/src/main/javacc/Scheme.jj b/languages/scheme/src/main/javacc/Scheme.jj index 1b8384992..9d09407d8 100644 --- a/languages/scheme/src/main/javacc/Scheme.jj +++ b/languages/scheme/src/main/javacc/Scheme.jj @@ -58,6 +58,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.IOException; import java.nio.charset.Charset; import de.jplag.ParsingException; @@ -81,6 +82,8 @@ public class SchemeParser { } catch (FileNotFoundException e) { System.out.println("Scheme Parser R4RS: File " + file.getName() + " not found."); throw new ParsingException(file, e.getMessage(), e); + } catch (IOException e) { + throw new ParsingException(file, e.getMessage(), e); } try { parser.Program(); From aed751f95e14a7185d9f95d86da5db75aae7c18b Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Tue, 25 Apr 2023 09:28:00 +0200 Subject: [PATCH 06/11] Implemented requested changes and made slight changes to the charset detection for sets.
--- .../main/java/de/jplag/util/FileUtils.java | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java index 63f4f282c..2cbafb034 100644 --- a/language-api/src/main/java/de/jplag/util/FileUtils.java +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -14,7 +14,9 @@ * Encapsulates various interactions with files to prevent issues with file encodings. */ public class FileUtils { - private static final Charset defaultOutputCharset = StandardCharsets.UTF_8; + private static final Charset DEFAULT_OUTPUT_CHARSET = StandardCharsets.UTF_8; + private static final char BOM = '\uFEFF'; + private static final int SINGLE_CHAR_BUFFER_SIZE = 10; /** * Opens a file reader, guessing the charset from the content. Also, if the file is encoded in a UTF* encoding and a bom @@ -52,8 +54,8 @@ public static String readFileContent(File file) throws IOException { */ private static void removeBom(BufferedReader reader, Charset charset) throws IOException { if (charset.name().toUpperCase().startsWith("UTF")) { - reader.mark(10); - if (reader.read() != '\uFEFF') { + reader.mark(SINGLE_CHAR_BUFFER_SIZE); + if (reader.read() != BOM) { reader.reset(); } } @@ -80,13 +82,16 @@ public static Charset detectCharset(File file) throws IOException { public static Charset detectCharsetFromMultiple(Collection files) { Map> charsetValues = new HashMap<>(); - files.stream().map(it -> { - try (InputStream stream = new BufferedInputStream(new FileInputStream(it))) { - return detectAllCharsets(stream); + List matchData = new ArrayList<>(); + for (File file : files) { + try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) { + matchData.add(detectAllCharsets(stream)); } catch (IOException e) { throw new RuntimeException(e); } - }).forEach(matches -> { + } + + for (CharsetMatch[] matches : matchData) { Set remaining = new 
HashSet<>(Set.of(CharsetDetector.getAllDetectableCharsets())); for (CharsetMatch match : matches) { charsetValues.putIfAbsent(match.getName(), new ArrayList<>()); @@ -97,12 +102,15 @@ public static Charset detectCharsetFromMultiple(Collection files) { charsetValues.putIfAbsent(it, new ArrayList<>()); charsetValues.get(it).add(0); }); - }); + } AtomicReference mostProbable = new AtomicReference<>(StandardCharsets.UTF_8); - AtomicReference mostProbableConfidence = new AtomicReference<>((double) 0); + AtomicReference mostProbableConfidence = new AtomicReference<>(0.0); charsetValues.forEach((charset, confidenceValues) -> { double average = confidenceValues.stream().mapToInt(it -> it).average().orElse(0); + if (confidenceValues.stream().anyMatch(it -> it == 0)) { + average = 0; + } if (average > mostProbableConfidence.get()) { mostProbable.set(Charset.forName(charset)); mostProbableConfidence.set(average); @@ -136,6 +144,6 @@ private static CharsetMatch[] detectAllCharsets(InputStream stream) throws IOExc * @throws IOException If the file does not exist or is not writable */ public static Writer openFileWriter(File file) throws IOException { - return new BufferedWriter(new FileWriter(file, defaultOutputCharset)); + return new BufferedWriter(new FileWriter(file, DEFAULT_OUTPUT_CHARSET)); } } From ce323977fac608a2705fa8c8be6568d00c705b9c Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Tue, 25 Apr 2023 10:42:40 +0200 Subject: [PATCH 07/11] Fixed code smells --- language-api/src/main/java/de/jplag/util/FileUtils.java | 8 ++++++-- .../src/test/java/de/jplag/util/FileUtilTest.java | 7 ++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java index 2cbafb034..f16f343f4 100644 --- a/language-api/src/main/java/de/jplag/util/FileUtils.java +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -9,6 +9,7 @@ import 
com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; +import de.jplag.ParsingException; /** * Encapsulates various interactions with files to prevent issues with file encodings. @@ -18,6 +19,9 @@ public class FileUtils { private static final char BOM = '\uFEFF'; private static final int SINGLE_CHAR_BUFFER_SIZE = 10; + private FileUtils() { + } + /** * Opens a file reader, guessing the charset from the content. Also, if the file is encoded in a UTF* encoding and a bom * exists, it is removed from the reader. @@ -79,7 +83,7 @@ public static Charset detectCharset(File file) throws IOException { * @param files The files to check * @return The most probable charset */ - public static Charset detectCharsetFromMultiple(Collection files) { + public static Charset detectCharsetFromMultiple(Collection files) throws ParsingException { Map> charsetValues = new HashMap<>(); List matchData = new ArrayList<>(); @@ -87,7 +91,7 @@ public static Charset detectCharsetFromMultiple(Collection files) { try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) { matchData.add(detectAllCharsets(stream)); } catch (IOException e) { - throw new RuntimeException(e); + throw new ParsingException(file, e); } } diff --git a/language-api/src/test/java/de/jplag/util/FileUtilTest.java b/language-api/src/test/java/de/jplag/util/FileUtilTest.java index ff66c0b49..650ff5feb 100644 --- a/language-api/src/test/java/de/jplag/util/FileUtilTest.java +++ b/language-api/src/test/java/de/jplag/util/FileUtilTest.java @@ -7,6 +7,7 @@ import java.nio.file.Path; import java.util.Set; +import de.jplag.ParsingException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; @@ -20,7 +21,7 @@ public class FileUtilTest { @ParameterizedTest @MethodSource("searchTestFiles") - public void testReadFile(File file) throws IOException { + void testReadFile(File file) throws IOException { String found = 
FileUtils.readFileContent(file); Assertions.assertEquals(expectedFileContent, found, "File contains unexpected content: " + file.getAbsolutePath()); @@ -28,13 +29,13 @@ public void testReadFile(File file) throws IOException { @ParameterizedTest @MethodSource("searchTestFiles") - public void testCharsetDetection(File file) throws IOException { + void testCharsetDetection(File file) throws IOException { Assertions.assertEquals(Charset.forName(file.getName()), FileUtils.detectCharset(file), "Wrong charset assumed for: " + file.getAbsolutePath()); } @Test - public void testDetectFromFileSet() { + void testDetectFromFileSet() throws ParsingException { Set files = Set.of(TEST_FILE_SET_LOCATION.toFile().listFiles()); Charset encoding = FileUtils.detectCharsetFromMultiple(files); Assertions.assertEquals(StandardCharsets.ISO_8859_1, encoding); From aae9fde342eb2e1a7b3b937ab26a0b8291ce10c5 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Tue, 25 Apr 2023 10:42:40 +0200 Subject: [PATCH 08/11] Fixed code smells --- language-api/src/main/java/de/jplag/util/FileUtils.java | 9 +++++++-- .../src/test/java/de/jplag/util/FileUtilTest.java | 8 +++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java index 2cbafb034..e309f8ac2 100644 --- a/language-api/src/main/java/de/jplag/util/FileUtils.java +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -7,6 +7,8 @@ import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; +import de.jplag.ParsingException; + import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; @@ -18,6 +20,9 @@ public class FileUtils { private static final char BOM = '\uFEFF'; private static final int SINGLE_CHAR_BUFFER_SIZE = 10; + private FileUtils() { + } + /** * Opens a file reader, guessing the charset from the content. 
Also, if the file is encoded in a UTF* encoding and a bom * exists, it is removed from the reader. @@ -79,7 +84,7 @@ public static Charset detectCharset(File file) throws IOException { * @param files The files to check * @return The most probable charset */ - public static Charset detectCharsetFromMultiple(Collection files) { + public static Charset detectCharsetFromMultiple(Collection files) throws ParsingException { Map> charsetValues = new HashMap<>(); List matchData = new ArrayList<>(); @@ -87,7 +92,7 @@ public static Charset detectCharsetFromMultiple(Collection files) { try (InputStream stream = new BufferedInputStream(new FileInputStream(file))) { matchData.add(detectAllCharsets(stream)); } catch (IOException e) { - throw new RuntimeException(e); + throw new ParsingException(file, e); } } diff --git a/language-api/src/test/java/de/jplag/util/FileUtilTest.java b/language-api/src/test/java/de/jplag/util/FileUtilTest.java index ff66c0b49..070b71625 100644 --- a/language-api/src/test/java/de/jplag/util/FileUtilTest.java +++ b/language-api/src/test/java/de/jplag/util/FileUtilTest.java @@ -12,6 +12,8 @@ import org.junit.jupiter.params.ParameterizedTest; import org.junit.jupiter.params.provider.MethodSource; +import de.jplag.ParsingException; + public class FileUtilTest { private static final Path TEST_FILE_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileReaderTests"); private static final Path TEST_FILE_SET_LOCATION = Path.of("src", "test", "resources", "de", "jplag", "fileSetEncoding"); @@ -20,7 +22,7 @@ public class FileUtilTest { @ParameterizedTest @MethodSource("searchTestFiles") - public void testReadFile(File file) throws IOException { + void testReadFile(File file) throws IOException { String found = FileUtils.readFileContent(file); Assertions.assertEquals(expectedFileContent, found, "File contains unexpected content: " + file.getAbsolutePath()); @@ -28,13 +30,13 @@ public void testReadFile(File file) throws IOException { 
@ParameterizedTest @MethodSource("searchTestFiles") - public void testCharsetDetection(File file) throws IOException { + void testCharsetDetection(File file) throws IOException { Assertions.assertEquals(Charset.forName(file.getName()), FileUtils.detectCharset(file), "Wrong charset assumed for: " + file.getAbsolutePath()); } @Test - public void testDetectFromFileSet() { + void testDetectFromFileSet() throws ParsingException { Set files = Set.of(TEST_FILE_SET_LOCATION.toFile().listFiles()); Charset encoding = FileUtils.detectCharsetFromMultiple(files); Assertions.assertEquals(StandardCharsets.ISO_8859_1, encoding); From e64e538eb29d2937045a7cb92924113afa817ba6 Mon Sep 17 00:00:00 2001 From: Alexander Milster Date: Tue, 25 Apr 2023 11:29:00 +0200 Subject: [PATCH 09/11] Spotless second attempt --- language-api/src/main/java/de/jplag/util/FileUtils.java | 1 - language-api/src/test/java/de/jplag/util/FileUtilTest.java | 1 - 2 files changed, 2 deletions(-) diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java index d24748f9a..e309f8ac2 100644 --- a/language-api/src/main/java/de/jplag/util/FileUtils.java +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -11,7 +11,6 @@ import com.ibm.icu.text.CharsetDetector; import com.ibm.icu.text.CharsetMatch; -import de.jplag.ParsingException; /** * Encapsulates various interactions with files to prevent issues with file encodings. 
diff --git a/language-api/src/test/java/de/jplag/util/FileUtilTest.java b/language-api/src/test/java/de/jplag/util/FileUtilTest.java index 0e555e923..070b71625 100644 --- a/language-api/src/test/java/de/jplag/util/FileUtilTest.java +++ b/language-api/src/test/java/de/jplag/util/FileUtilTest.java @@ -7,7 +7,6 @@ import java.nio.file.Path; import java.util.Set; -import de.jplag.ParsingException; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; import org.junit.jupiter.params.ParameterizedTest; From cb5fb00bf78a159a5fddc0f26583439b9d90a26f Mon Sep 17 00:00:00 2001 From: TwoOfTwelve <101133883+TwoOfTwelve@users.noreply.github.com> Date: Wed, 26 Apr 2023 06:18:13 +0000 Subject: [PATCH 10/11] Better name for BYTE_ORDER_MARK. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Dominik Fuchß --- language-api/src/main/java/de/jplag/util/FileUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java b/language-api/src/main/java/de/jplag/util/FileUtils.java index e309f8ac2..7d09b0c75 100644 --- a/language-api/src/main/java/de/jplag/util/FileUtils.java +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -17,7 +17,7 @@ */ public class FileUtils { private static final Charset DEFAULT_OUTPUT_CHARSET = StandardCharsets.UTF_8; - private static final char BOM = '\uFEFF'; + private static final char BYTE_ORDER_MARK = '\uFEFF'; private static final int SINGLE_CHAR_BUFFER_SIZE = 10; private FileUtils() { From 67f2cc7ebbf63724f91f7377a82e2569d137da09 Mon Sep 17 00:00:00 2001 From: TwoOfTwelve <101133883+TwoOfTwelve@users.noreply.github.com> Date: Wed, 26 Apr 2023 06:24:44 +0000 Subject: [PATCH 11/11] Fixed compiler error --- language-api/src/main/java/de/jplag/util/FileUtils.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/language-api/src/main/java/de/jplag/util/FileUtils.java 
b/language-api/src/main/java/de/jplag/util/FileUtils.java index 7d09b0c75..311674cbd 100644 --- a/language-api/src/main/java/de/jplag/util/FileUtils.java +++ b/language-api/src/main/java/de/jplag/util/FileUtils.java @@ -60,7 +60,7 @@ public static String readFileContent(File file) throws IOException { private static void removeBom(BufferedReader reader, Charset charset) throws IOException { if (charset.name().toUpperCase().startsWith("UTF")) { reader.mark(SINGLE_CHAR_BUFFER_SIZE); - if (reader.read() != BOM) { + if (reader.read() != BYTE_ORDER_MARK) { reader.reset(); } }