Skip to content

Commit

Permalink
Merge pull request #231 from dkpro/enhancement/24-delete-.bin-files-a…
Browse files Browse the repository at this point in the history
…fter-dump-creation

#24 - Automatically delete *.bin files after dump creation process
  • Loading branch information
reckart authored Oct 19, 2023
2 parents 4b5d2d2 + a435d8b commit c2cad8f
Show file tree
Hide file tree
Showing 2 changed files with 111 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@
package de.tudarmstadt.ukp.wikipedia.datamachine.dump.xml;

import java.io.BufferedOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.GZIPOutputStream;

import de.tudarmstadt.ukp.wikipedia.datamachine.domain.DataMachineFiles;
import de.tudarmstadt.ukp.wikipedia.datamachine.file.DeleteFilesAtShutdown;
import de.tudarmstadt.ukp.wikipedia.mwdumper.importer.DumpWriter;
import de.tudarmstadt.ukp.wikipedia.mwdumper.importer.Page;
import de.tudarmstadt.ukp.wikipedia.mwdumper.importer.Revision;
Expand All @@ -41,31 +45,48 @@ public class SimpleBinaryDumpWriter implements DumpWriter {
private Page currentPage;
private Revision lastRevision;

public SimpleBinaryDumpWriter(DataMachineFiles files) throws IOException {
this.files = files;
if (this.files.isCompressGeneratedFiles()) {
createCompressed();
} else {
createUncompressed();
}
}

/**
 * Opens the page, revision and text output files as plain (uncompressed) streams.
 * <p>
 * Every file is opened exactly once through {@code openUTFDataOutputStream}. Opening it a
 * second time with a raw {@code FileOutputStream} would leave an unclosed stream behind as soon
 * as the field is reassigned.
 *
 * @throws IOException propagated from opening one of the output streams
 */
protected void createUncompressed() throws IOException {
    pageFile = openUTFDataOutputStream(files.getGeneratedPage(), false);
    revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), false);
    textFile = openUTFDataOutputStream(files.getGeneratedText(), false);
}

/**
 * Opens the page, revision and text output files as GZIP-compressed streams.
 * <p>
 * Every file is opened exactly once through {@code openUTFDataOutputStream}. Opening it a
 * second time with a hand-built {@code GZIPOutputStream} chain would leave an unclosed stream
 * behind as soon as the field is reassigned.
 *
 * @throws IOException propagated from opening one of the output streams
 */
protected void createCompressed() throws IOException {
    pageFile = openUTFDataOutputStream(files.getGeneratedPage(), true);
    revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), true);
    textFile = openUTFDataOutputStream(files.getGeneratedText(), true);
}

public SimpleBinaryDumpWriter(DataMachineFiles files) throws IOException {
this.files = files;
if (this.files.isCompressGeneratedFiles()) {
createCompressed();
private UTFDataOutputStream openUTFDataOutputStream(final String filePath, final boolean compressed) throws IOException {
UTFDataOutputStream utfDataOutputStream;
if(compressed) {
utfDataOutputStream = new UTFDataOutputStream(new GZIPOutputStream(openFileStreamAndRegisterDeletion(filePath)));
} else {
createUncompressed();
utfDataOutputStream = new UTFDataOutputStream(openFileStreamAndRegisterDeletion(filePath));
}
return utfDataOutputStream;
}

/**
 * Opens a buffered output stream on the given file and registers that file with
 * {@code DeleteFilesAtShutdown}.
 *
 * @param filePath location of the file to write
 * @return a buffered stream writing to {@code filePath}
 * @throws IOException propagated from {@link Files#newOutputStream}
 */
private BufferedOutputStream openFileStreamAndRegisterDeletion(final String filePath) throws IOException {
    final Path target = Paths.get(filePath);

    // Per the JavaDoc of Files.newOutputStream (no options given):
    // "truncate and overwrite an existing file, or create the file if it doesn't initially exist"
    final OutputStream rawStream = Files.newOutputStream(target);

    // Registration happens only after the file was opened successfully.
    DeleteFilesAtShutdown.register(target);

    return new BufferedOutputStream(rawStream);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.wikipedia.datamachine.file;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

/**
 * A file deletion "watch dog" that can be used to remove files via their {@link Path} references.
 * Registered files are deleted by a JVM shutdown hook on a best-effort basis: the hook does not
 * run when the JVM is halted or killed forcibly, and failures to delete individual files are
 * ignored.
 * <p>
 * Inspired by and adapted from the answer here:
 * <a href="https://stackoverflow.com/a/42389029">https://stackoverflow.com/a/42389029</a>
 */
public final class DeleteFilesAtShutdown {
    /**
     * Paths awaiting deletion, in registration order. Set to {@code null} once cleanup has
     * started. All access is guarded by the class monitor.
     */
    private static Set<Path> paths = new LinkedHashSet<>();

    static {
        // Registers 'cleanupRegisteredFiles' to be called at JVM shutdown.
        Runtime.getRuntime().addShutdownHook(new Thread(DeleteFilesAtShutdown::cleanupRegisteredFiles));
    }

    private DeleteFilesAtShutdown() {
        // Static utility class - not meant to be instantiated.
    }

    /**
     * Deletes all registered files, most recently registered first. Once this has started,
     * {@link #register(Path)} rejects further paths.
     */
    private static void cleanupRegisteredFiles() {
        Set<Path> local;
        synchronized (DeleteFilesAtShutdown.class) {
            local = paths;
            paths = null;
        }
        if (local == null) {
            // Cleanup already ran; nothing left to delete.
            return;
        }

        List<Path> toBeDeleted = new ArrayList<>(local);
        Collections.reverse(toBeDeleted);
        for (Path p : toBeDeleted) {
            try {
                Files.delete(p);
            } catch (IOException | RuntimeException ignored) {
                // Best-effort: a file that cannot be deleted must not prevent
                // the remaining files from being cleaned up.
            }
        }
    }

    /**
     * Registers a {@link Path} to be removed at JVM shutdown. Registering the same path more
     * than once has no additional effect.
     *
     * @param filePath A valid path pointing to a file.
     * @throws IllegalStateException if the shutdown cleanup has already started
     */
    public static synchronized void register(Path filePath) {
        if (paths == null) {
            throw new IllegalStateException("Shutdown hook is already in progress. Adding paths is not allowed now!");
        }
        paths.add(filePath);
    }
}

0 comments on commit c2cad8f

Please sign in to comment.