Skip to content

Commit

Permalink
Added a simple TSV/CSV/XSV writer with cloud write support as an alternative to TableWriter (broadinstitute#5930)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesemery authored and rori committed May 15, 2019
1 parent 81f4918 commit 92f953c
Show file tree
Hide file tree
Showing 2 changed files with 308 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
package org.broadinstitute.hellbender.utils.tsv;

import com.opencsv.CSVWriter;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.utils.Utils;

import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

/**
 * A simple TSV/CSV/XSV writer with support for writing in the cloud with a configurable delimiter.
 *
 * The expected use case for this class is that first {@link #setHeaderLine} is called with a list of the column names
 * which will be used to determine the number of columns per line as well as how the header is indexed. Then in order to
 * construct a new line call {@link #getNewLineBuilder} to get a line builder for each line, which then has convenient
 * methods for individually assigning column values based on the header line etc. Once a line is finished being mutated
 * one simply needs to call write() on the line to validate and finalize the line. A line that has not been written
 * explicitly is written automatically when the next line builder is requested or when the writer is closed.
 *
 * Header lines are encoded in the same format as each row, a single row of delimited column titles as the first row in the table.
 *
 * Note: this class is intended for creating XSV files with loosely defined input types. If there exists a well defined object
 * that summarizes your table data points then consider using {@link TableWriter}.
 */
public class SimpleXSVWriter implements Closeable {
    // Number of columns every body line must have; assigned when the header line is set.
    private int expectedNumColumns;
    // Column name -> zero based column index; null until the header line has been set.
    private Map<String, Integer> headerMap = null;
    private final CSVWriter outputWriter;

    // The current incomplete line in the writer.
    private LineBuilder currentLineBuilder = null;

    /**
     * Creates a new table writer given the destination path and separator. Output is encoded as UTF-8.
     *
     * @param path      the destination path. This could be a cloud uri (ex. gs://...)
     * @param separator separator to use for the XSV file, must not be a newline character
     * @throws IOException if one was raised when opening the destination file for writing.
     */
    public SimpleXSVWriter(final Path path, final char separator) throws IOException {
        this(openWriter(path, separator), separator);
    }

    /**
     * Creates a new table writer given an initialized writer and separator.
     *
     * @param writer    the destination writer.
     * @param separator separator to use for the XSV file, must not be a newline character
     */
    public SimpleXSVWriter(final Writer writer, final char separator) {
        validateSeparator(separator);
        outputWriter = new CSVWriter(writer, separator);
    }

    // Validates the arguments BEFORE opening the destination so that an invalid separator neither creates/truncates
    // the output file nor leaks an open stream.
    private static Writer openWriter(final Path path, final char separator) throws IOException {
        Utils.nonNull(path, "The path cannot be null.");
        validateSeparator(separator);
        // Explicit charset so the output does not depend on the platform default encoding.
        return new OutputStreamWriter(Files.newOutputStream(path), StandardCharsets.UTF_8);
    }

    // A newline separator would make columns indistinguishable from rows.
    private static void validateSeparator(final char separator) {
        Utils.validate(separator != '\n', "Column separator cannot be a newline character");
    }

    /**
     * Provides a header line to the XSV output file. Note that this will throw an exception if all header lines
     * are not unique as it attempts to create an index for the provided header lines for convenience when building
     * rows of the XSV.
     *
     * The provided columns are validated before anything is written: if validation fails nothing is written to the
     * output and the header remains unset.
     *
     * NOTE: This can only be set once, XSV output files are expected to only have a single row as header.
     *
     * @param columns Ordered list of header lines to be built into the XSV
     * @throws GATKException if the header has already been set or if a column name is duplicated
     */
    public void setHeaderLine(List<String> columns) {
        if (headerMap != null) {
            throw new GATKException("Cannot modify header line once set");
        }

        // Build the mapping between header and column into a local map first so that a validation failure
        // cannot leave this writer with a half-initialized header.
        final Map<String, Integer> columnIndex = new HashMap<>();
        for (int i = 0; i < columns.size(); i++) {
            final String column = columns.get(i);
            Utils.nonNull(column, "Provided header had null column at position: " + i);
            if (columnIndex.putIfAbsent(column, i) != null) {
                throw new GATKException("Column names must be unique, but found a duplicate name: " + column);
            }
        }

        // Only commit (write + remember) the header once it is known to be valid.
        outputWriter.writeNext(columns.toArray(new String[0]), false);
        expectedNumColumns = columns.size();
        headerMap = columnIndex;
    }

    // Writes a finished line to the output and marks the writer as having no pending line.
    private void writeLine(String[] line) {
        outputWriter.writeNext(line, false);
        currentLineBuilder = null;
    }

    /**
     * Builds a new LineBuilder and writes out the previous line if it exists.
     *
     * @return a blank LineBuilder to allow for defining the next line
     * @throws GATKException if the header line has not been set yet
     */
    public LineBuilder getNewLineBuilder() {
        if (headerMap == null) {
            throw new GATKException("Cannot construct line without first setting the header line");
        }
        if (currentLineBuilder != null) {
            currentLineBuilder.write();
        }
        currentLineBuilder = new LineBuilder(expectedNumColumns);
        return currentLineBuilder;
    }

    /**
     * @param column header line to get index for
     * @return zero based index corresponding to that header string, throws an exception if the header line doesn't exist
     */
    public Integer getIndexForColumn(String column) {
        Utils.nonNull(headerMap, "Cannot request column index if the header has not been specified");
        Integer index = headerMap.get(column);
        Utils.nonNull(index, "Requested column " + column + " does not exist in the provided header");
        return index;
    }

    /**
     * Writes out the pending line (if any) and closes the underlying writer. The underlying writer is closed even
     * if the pending line is incomplete and therefore fails validation.
     *
     * @throws IOException if one was raised when closing the underlying writer
     */
    @Override
    public void close() throws IOException {
        // Detach the pending line up front so a repeated close() does not try to write it again.
        final LineBuilder pendingLine = currentLineBuilder;
        currentLineBuilder = null;
        try {
            if (pendingLine != null) {
                pendingLine.write();
            }
        } finally {
            // Always release the destination, even when the pending line was rejected.
            outputWriter.close();
        }
    }

    /**
     * Helper to allow for incremental construction of a body line using either indexes or column headings
     * <p>
     * Calling write() will cause the line to be written out into the underlying CSV writer in its current state. Doing
     * so will result in a validation call where an exception will be thrown if any columns of the current line have
     * not been defined. fill() can be used to provide a default value for undefined columns. Once written, a line
     * can neither be altered nor written again.
     */
    public class LineBuilder {
        // Column values of the pending line; a null entry marks a column that has not been set yet.
        String[] lineToBuild;
        // True once this line has been written to the output.
        boolean hasBuilt = false;

        LineBuilder(int lineLength) {
            lineToBuild = new String[lineLength];
        }

        /**
         * @param row complete line corresponding to this row of the tsv
         * @throws IllegalStateException if the row does not have one value per header column or the line was already written
         */
        public LineBuilder setRow(final String[] row) {
            checkAlterationAfterWrite();
            Utils.validate(row.length == lineToBuild.length, "Provided line must have the correct number of columns");
            // Copy rather than alias so later changes to the caller's array cannot affect the pending line.
            System.arraycopy(row, 0, lineToBuild, 0, row.length);
            return this;
        }

        /**
         * @param row complete line corresponding to this row of the tsv
         * @throws IllegalStateException if the row does not have one value per header column or the line was already written
         */
        public LineBuilder setRow(final List<String> row) {
            checkAlterationAfterWrite();
            Utils.validate(row.size() == lineToBuild.length, "Provided line must have the correct number of columns");
            for (int i = 0; i < row.size(); i++) {
                lineToBuild[i] = row.get(i);
            }
            return this;
        }

        /**
         * @param index Zero based column index to be set; an index outside of the header's columns results in an
         *              ArrayIndexOutOfBoundsException
         * @param value Value to be placed into the line
         */
        public LineBuilder setColumn(final int index, final String value) {
            checkAlterationAfterWrite();
            lineToBuild[index] = value;
            return this;
        }

        /**
         * @param heading Column heading to be set, must be one of the columns of the header line
         * @param value   Value to be placed into the line
         */
        public LineBuilder setColumn(final String heading, final String value) {
            int index = getIndexForColumn(heading);
            return setColumn(index, value);
        }

        /**
         * Fills in every empty column of the pending line with the provided value
         */
        public LineBuilder fill(final String filling) {
            checkAlterationAfterWrite();
            for (int i = 0; i < lineToBuild.length; i++) {
                if (lineToBuild[i] == null) {
                    lineToBuild[i] = filling;
                }
            }
            return this;
        }

        /**
         * Validates that every column has a value and writes the line out to the output.
         *
         * @throws IllegalStateException if the line was already written or if any column is still undefined
         */
        public void write() {
            // Writing the same builder twice would duplicate the row and reset the writer's current line,
            // silently dropping any newer line that is still being built.
            Utils.validate(!hasBuilt, "Cannot write out an already written out CSV line");
            Utils.validate(Arrays.stream(lineToBuild).noneMatch(Objects::isNull), "Attempted to construct an incomplete line, make sure all columns are filled");
            writeLine(lineToBuild);
            hasBuilt = true;
        }

        // Throw an exception if we try to alter an already written out line
        private void checkAlterationAfterWrite() {
            Utils.validate(!hasBuilt, "Cannot make alterations to an already written out CSV line");
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
package org.broadinstitute.hellbender.utils.tsv;

import htsjdk.samtools.util.BufferedLineReader;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.LineReader;
import org.broadinstitute.hellbender.GATKBaseTest;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.testutils.IntegrationTestSpec;
import org.broadinstitute.hellbender.utils.gcs.BucketUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Arrays;

/**
 * Unit tests for {@link SimpleXSVWriter}: cloud/local output equivalence, filling of blank columns and
 * rejection of malformed usage (wrong row width, missing header).
 */
public class SimpleCSVWriterWrapperWithHeaderUnitTest extends GATKBaseTest {

    // Writes the same 100 rows to a bucket path and to a local path and checks the two files are identical.
    @Test (groups = "bucket")
    public void testWriteToBucketPathEquivalentToLocalPath() throws IOException {
        final Path bucketPath = IOUtils.getPath(BucketUtils.getTempFilePath(
                getGCPTestStaging() + "testWriteToBucketPathEquivalentToLocalPath", ".tsv"));
        final Path localPath = IOUtils.getPath(createTempFile("testWriteToBucketPathEquivalentToLocalPath", ".tsv").getPath());

        try (SimpleXSVWriter bucketWriter = new SimpleXSVWriter(bucketPath, '\t');
             SimpleXSVWriter localWriter = new SimpleXSVWriter(localPath, '\t')) {

            final String[] header = new String[]{"a", "b", "c"};
            bucketWriter.setHeaderLine(Arrays.asList(header));
            localWriter.setHeaderLine(Arrays.asList(header));

            for (int i = 0; i < 100; i++) {
                // Lines are never written explicitly: each one is flushed by the next getNewLineBuilder()
                // call, and the last one by close().
                final SimpleXSVWriter.LineBuilder bucketLine = bucketWriter.getNewLineBuilder();
                final SimpleXSVWriter.LineBuilder localLine = localWriter.getNewLineBuilder();
                final String value = Integer.toString(i);
                for (final String column : header) {
                    bucketLine.setColumn(column, value);
                    localLine.setColumn(column, value);
                }
            }
        }
        IntegrationTestSpec.assertEqualTextFiles(bucketPath, localPath, null);
    }

    // Checks that fill() only populates columns that have not been set and that later setColumn() calls still win.
    @Test
    public void testFillingInBlankLines() throws IOException {
        final Path outputPath = IOUtils.getPath(createTempFile("testFillingInBlankLines", ".csv").getPath());

        try (SimpleXSVWriter localWriter = new SimpleXSVWriter(outputPath, ',')) {
            final String[] header = new String[]{"a", "b", "c","d"};
            localWriter.setHeaderLine(Arrays.asList(header));

            for (int i = 0; i < 100; i++) {
                final SimpleXSVWriter.LineBuilder localLine = localWriter.getNewLineBuilder();
                localLine.setColumn("b", "10");
                localLine.fill("0");
                // Overwrites the "0" that fill() just placed in column d.
                localLine.setColumn("d", "1");
            }
        }

        try (final FileInputStream fis= new FileInputStream(outputPath.toFile());
             final BufferedLineReader br = new BufferedLineReader(fis)) {
            Assert.assertEquals(br.readLine(), "a,b,c,d");
            for (int lineCount = 0; lineCount < 100; lineCount++) {
                Assert.assertEquals(br.readLine(), "0,10,0,1");
            }
            // Exactly one header and 100 body lines are expected, nothing more.
            Assert.assertNull(br.readLine(), "Output contained more lines than were written");
        }
    }

    // A row with more values than header columns must be rejected.
    @Test (expectedExceptions = IllegalStateException.class)
    public void testWrongNumberOfLines() throws IOException {
        final Path outputPath = IOUtils.getPath(createTempFile("testWrongNumberOfLines", ".csv").getPath());

        try (SimpleXSVWriter localWriter = new SimpleXSVWriter(outputPath, ',')) {
            final String[] header = new String[]{"a", "b", "c","d"};
            localWriter.setHeaderLine(Arrays.asList(header));

            localWriter.getNewLineBuilder().setRow(Arrays.asList("1","2","3","4","5"));
        }
    }

    // Requesting a line before the header has been set must be rejected.
    @Test (expectedExceptions = GATKException.class)
    public void testMissingHeader() throws IOException {
        final Path outputPath = IOUtils.getPath(createTempFile("testMissingHeader", ".csv").getPath());

        try (SimpleXSVWriter localWriter = new SimpleXSVWriter(outputPath, ',')) {
            localWriter.getNewLineBuilder();
        }
    }
}

0 comments on commit 92f953c

Please sign in to comment.