forked from broadinstitute/gatk
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added a simple TSV/CSV/XSV writer with cloud write support as an alte…
…rnative to TableWriter (broadinstitute#5930)
- Loading branch information
1 parent
81f4918
commit 92f953c
Showing
2 changed files
with
308 additions
and
0 deletions.
There are no files selected for viewing
213 changes: 213 additions & 0 deletions
213
src/main/java/org/broadinstitute/hellbender/utils/tsv/SimpleXSVWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,213 @@ | ||
package org.broadinstitute.hellbender.utils.tsv; | ||
|
||
import com.opencsv.CSVWriter; | ||
import org.broadinstitute.hellbender.exceptions.GATKException; | ||
import org.broadinstitute.hellbender.utils.Utils; | ||
|
||
import java.io.Closeable; | ||
import java.io.IOException; | ||
import java.io.OutputStreamWriter; | ||
import java.io.Writer; | ||
import java.nio.file.Files; | ||
import java.nio.file.Path; | ||
import java.util.*; | ||
|
||
/** | ||
* A simple TSV/CSV/XSV writer with support for writing in the cloud with configurable delimiter. | ||
* | ||
* The expected use case for this class is that first {@link #setHeaderLine} is called with a list of the column names | ||
* which will be used to determine the number of columns per line as well as how the header is indexed. Then in order to | ||
* construct a new line call {@link #getNewLineBuilder} to get a line builder for each line, which then has convienent | ||
* methods for individually assigning column values based on the header line etc. Once a line is finished being mutated | ||
* one simply needs to call write() on the line to validate and finalize the line. | ||
* | ||
* Header lines are encoded in the same format as each row, a single row of delimeted column titles as the first row in the table. | ||
* | ||
* Note: this class is intended for creating XSV files with loosely defined input types. If there exists a well defined object | ||
* that summarizes your table data points then consider using {@link TableWriter}. | ||
*/ | ||
public class SimpleXSVWriter implements Closeable { | ||
private int expectedNumColumns; | ||
private Map<String, Integer> headerMap = null; | ||
private CSVWriter outputWriter; | ||
|
||
// The current incomplete line in the writer. | ||
private LineBuilder currentLineBuilder = null; | ||
|
||
/** | ||
* Creates a new table writer given the file and column names. | ||
* | ||
* @param path the destination path. This could be a cloud uri (ex. gs://...) | ||
* @param separator separator to use for the XSV file | ||
* @throws IOException if one was raised when opening the the destination file for writing. | ||
*/ | ||
public SimpleXSVWriter(final Path path, final char separator) throws IOException { | ||
this( new OutputStreamWriter( | ||
Files.newOutputStream(Utils.nonNull(path, "The path cannot be null."))), | ||
separator); | ||
} | ||
|
||
/** | ||
* Creates a new table writer given an initialized writer and column names. | ||
* | ||
* @param writer the destination writer. | ||
* @param separator separator to use for the TSV file | ||
* @throws IOException if one was raised when opening the the destination file for writing. | ||
*/ | ||
public SimpleXSVWriter(final Writer writer, final char separator) { | ||
Utils.validate(separator!='\n', "Column separator cannot be a newline character"); | ||
outputWriter = new CSVWriter(writer, separator); | ||
} | ||
|
||
/** | ||
* Provides a header line to the XSV output file. Note that this will throw an exception if all header lines | ||
* are not unique as it attempts to create an index for the provided header lines for convenience when building | ||
* rows of the XSV. | ||
* | ||
* NOTE: This can only be set once, XSV output files are expected to only have a single row as header. | ||
* | ||
* @param columns Ordered list of header lines to be built into the XSV | ||
*/ | ||
public void setHeaderLine(List<String> columns) { | ||
if (headerMap != null) { | ||
throw new GATKException("Cannot modify header line once set"); | ||
} | ||
outputWriter.writeNext(columns.toArray(new String[0]), false); | ||
expectedNumColumns = columns.size(); | ||
|
||
// Create the mapping between header and column | ||
headerMap = new HashMap<>(); | ||
for (int i = 0; i < columns.size(); i++) { | ||
Utils.nonNull(columns.get(i), "Provided header had null column at position: " + i); | ||
if (headerMap.putIfAbsent(columns.get(i), i) != null) { | ||
throw new GATKException("Column names must be unique, but found a duplicate name: " + columns.get(i)); | ||
} | ||
} | ||
} | ||
|
||
private void writeLine(String[] line) { | ||
outputWriter.writeNext(line, false); | ||
currentLineBuilder = null; | ||
} | ||
|
||
/** | ||
* Builds a new LineBuilder and writes out the previous line if it exists. | ||
* | ||
* @return a blank LineBuilder to allow for defining the next line | ||
*/ | ||
public LineBuilder getNewLineBuilder() { | ||
if (headerMap == null) { | ||
throw new GATKException("Cannot construct line without first setting the header line"); | ||
} | ||
if (currentLineBuilder != null) { | ||
currentLineBuilder.write(); | ||
} | ||
currentLineBuilder = new LineBuilder(expectedNumColumns); | ||
return currentLineBuilder; | ||
} | ||
|
||
/** | ||
* @param column header line to get index for | ||
* @return zero based index corresponding to that header string, throws an exception if the headerline doesn't exist | ||
*/ | ||
public Integer getIndexForColumn(String column) { | ||
Utils.nonNull(headerMap, "Cannot request column index if the header has not been specified"); | ||
Integer index = headerMap.get(column); | ||
Utils.nonNull(index, "Requested column " + column + " does not exist in the provided header"); | ||
return index; | ||
} | ||
|
||
@Override | ||
public void close() throws IOException { | ||
if (currentLineBuilder != null) { | ||
currentLineBuilder.write(); | ||
} | ||
outputWriter.close(); | ||
} | ||
|
||
/** | ||
* Helper to allow for incremental construction of a body line using either indexes or column headings | ||
* <p> | ||
* Calling build() will cause the line to be written out into the underlying CSV writer in its current state. Doing | ||
* so will result in a validation call where an exception will be thrown if any columns of the current line have | ||
* not been defined. fill() can be used to provide a default value for undefined columns. | ||
*/ | ||
public class LineBuilder { | ||
String[] lineToBuild; | ||
boolean hasBuilt = false; | ||
|
||
LineBuilder(int lineLength) { | ||
lineToBuild = new String[lineLength]; | ||
} | ||
|
||
/** | ||
* @param row complete line corresponding to this row of the tsv | ||
*/ | ||
public LineBuilder setRow(final String[] row) { | ||
checkAlterationAfterWrite(); | ||
Utils.validate(row.length == lineToBuild.length, "Provided line must have the correct number of columns"); | ||
for (int i = 0; i < row.length; i++) { | ||
lineToBuild[i] = row[i]; | ||
} | ||
return this; | ||
} | ||
|
||
/** | ||
* @param row complete line corresponding to this row of the tsv | ||
*/ | ||
public LineBuilder setRow(final List<String> row) { | ||
checkAlterationAfterWrite(); | ||
Utils.validate(row.size() == lineToBuild.length, "Provided line must have the correct number of columns"); | ||
for (int i = 0; i < row.size(); i++) { | ||
lineToBuild[i] = row.get(i); | ||
} | ||
return this; | ||
} | ||
|
||
/** | ||
* @param index Column index to be set | ||
* @param value Value to be placed into the line | ||
*/ | ||
public LineBuilder setColumn(final int index, final String value) { | ||
checkAlterationAfterWrite(); | ||
lineToBuild[index] = value; | ||
return this; | ||
} | ||
|
||
/** | ||
* @param heading Column heading to be set | ||
* @param value Value to be placed into the line | ||
*/ | ||
public LineBuilder setColumn(final String heading, final String value) { | ||
int index = getIndexForColumn(heading); | ||
return setColumn(index, value); | ||
} | ||
|
||
/** | ||
* Fills in every empty column of the pending line with the provided value | ||
*/ | ||
public LineBuilder fill(final String filling) { | ||
checkAlterationAfterWrite(); | ||
for (int i = 0; i < lineToBuild.length; i++) { | ||
if (lineToBuild[i] == null) { | ||
lineToBuild[i] = filling; | ||
} | ||
} | ||
return this; | ||
} | ||
|
||
/** | ||
* Constructs the line and writes it out to the output | ||
*/ | ||
public void write() { | ||
Utils.validate(!Arrays.stream(lineToBuild).anyMatch(Objects::isNull), "Attempted to construct an incomplete line, make sure all columns are filled"); | ||
writeLine(lineToBuild); | ||
hasBuilt = true; | ||
} | ||
|
||
// Throw an exception if we try to alter an already written out line | ||
private void checkAlterationAfterWrite() { | ||
Utils.validate(!hasBuilt, "Cannot make alterations to an already written out CSV line"); | ||
} | ||
} | ||
} |
95 changes: 95 additions & 0 deletions
95
...ava/org/broadinstitute/hellbender/utils/tsv/SimpleCSVWriterWrapperWithHeaderUnitTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
package org.broadinstitute.hellbender.utils.tsv; | ||
|
||
import htsjdk.samtools.util.BufferedLineReader; | ||
import htsjdk.samtools.util.IOUtil; | ||
import htsjdk.samtools.util.LineReader; | ||
import org.broadinstitute.hellbender.GATKBaseTest; | ||
import org.broadinstitute.hellbender.exceptions.GATKException; | ||
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec; | ||
import org.broadinstitute.hellbender.utils.gcs.BucketUtils; | ||
import org.broadinstitute.hellbender.utils.io.IOUtils; | ||
import org.testng.Assert; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.nio.file.Path; | ||
import java.util.Arrays; | ||
|
||
public class SimpleCSVWriterWrapperWithHeaderUnitTest extends GATKBaseTest { | ||
|
||
@Test (groups = "bucket") | ||
public void testWriteToBucketPathEquivalentToLocalPath() throws IOException { | ||
Path bucketPath = IOUtils.getPath(BucketUtils.getTempFilePath( | ||
getGCPTestStaging() + "testWriteToBucketPathEquivalentToLocalPath", ".tsv")); | ||
Path localPath = IOUtils.getPath(createTempFile("testWriteToBucketPathEquivalentToLocalPath", ".tsv").getPath()); | ||
|
||
try (SimpleXSVWriter bucketWriter = new SimpleXSVWriter(bucketPath, '\t'); | ||
SimpleXSVWriter localWriter = new SimpleXSVWriter(localPath, '\t')) { | ||
|
||
String[] header = new String[]{"a", "b", "c"}; | ||
bucketWriter.setHeaderLine(Arrays.asList(header)); | ||
localWriter.setHeaderLine(Arrays.asList(header)); | ||
|
||
for (int i = 0; i < 100; i++) { | ||
SimpleXSVWriter.LineBuilder bucketLine = bucketWriter.getNewLineBuilder(); | ||
SimpleXSVWriter.LineBuilder localLine = localWriter.getNewLineBuilder(); | ||
int finalI = i; | ||
Arrays.stream(header).forEach(column -> { | ||
bucketLine.setColumn(column, Integer.toString(finalI)); | ||
localLine.setColumn(column, Integer.toString(finalI)); | ||
}); | ||
} | ||
} | ||
IntegrationTestSpec.assertEqualTextFiles(bucketPath, localPath, null); | ||
} | ||
|
||
@Test | ||
public void testFillingInBlankLines() throws IOException { | ||
Path outputPath = IOUtils.getPath(createTempFile("testWriteToBucketPathEquivalentToLocalPath", ".csv").getPath()); | ||
|
||
try (SimpleXSVWriter localWriter = new SimpleXSVWriter(outputPath, ',')) { | ||
String[] header = new String[]{"a", "b", "c","d"}; | ||
localWriter.setHeaderLine(Arrays.asList(header)); | ||
|
||
for (int i = 0; i < 100; i++) { | ||
SimpleXSVWriter.LineBuilder localLine = localWriter.getNewLineBuilder(); | ||
Arrays.stream(header).forEach(column -> { | ||
localLine.setColumn("b", "10"); | ||
localLine.fill("0"); | ||
localLine.setColumn("d", "1"); | ||
}); | ||
} | ||
} | ||
|
||
try (final FileInputStream fis= new FileInputStream(outputPath.toFile()); | ||
final BufferedLineReader br = new BufferedLineReader(fis)) { | ||
Assert.assertEquals(br.readLine(), "a,b,c,d"); | ||
int lineCount = 0; | ||
while (lineCount++ < 100) { | ||
Assert.assertEquals(br.readLine(), "0,10,0,1"); | ||
} | ||
} | ||
} | ||
|
||
@Test (expectedExceptions = IllegalStateException.class) | ||
public void testWrongNumberOfLines() throws IOException { | ||
Path outputPath = IOUtils.getPath(createTempFile("testWriteToBucketPathEquivalentToLocalPath", ".csv").getPath()); | ||
|
||
try (SimpleXSVWriter localWriter = new SimpleXSVWriter(outputPath, ',')) { | ||
String[] header = new String[]{"a", "b", "c","d"}; | ||
localWriter.setHeaderLine(Arrays.asList(header)); | ||
|
||
localWriter.getNewLineBuilder().setRow(Arrays.asList("1","2","3","4","5")); | ||
} | ||
} | ||
|
||
@Test (expectedExceptions = GATKException.class) | ||
public void testMissingHeader() throws IOException { | ||
Path outputPath = IOUtils.getPath(createTempFile("testWriteToBucketPathEquivalentToLocalPath", ".csv").getPath()); | ||
|
||
try (SimpleXSVWriter localWriter = new SimpleXSVWriter(outputPath, ',')) { | ||
localWriter.getNewLineBuilder(); | ||
} | ||
} | ||
} |