Skip to content

Commit

Permalink
Add new class VCFHeaderReader (#1148)
Browse files Browse the repository at this point in the history
* adding a new class VCFHeaderReader to read a VCFHeader from a stream without knowing in advance if the file is VCF or BCF, or compressed or not.
* part of #1112
  • Loading branch information
lbergelson authored Jun 14, 2018
1 parent 52ec082 commit 8b55de6
Show file tree
Hide file tree
Showing 3 changed files with 110 additions and 0 deletions.
61 changes: 61 additions & 0 deletions src/main/java/htsjdk/variant/utils/VCFHeaderReader.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
package htsjdk.variant.utils;

import htsjdk.samtools.SamStreams;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.tribble.Feature;
import htsjdk.tribble.FeatureCodec;
import htsjdk.tribble.FeatureCodecHeader;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.bcf2.BCF2Codec;
import htsjdk.variant.bcf2.BCFVersion;
import htsjdk.variant.vcf.VCFCodec;
import htsjdk.variant.vcf.VCFHeader;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.zip.GZIPInputStream;

/**
* Utility class to read a VCF header without being told beforehand whether the input is VCF or BCF.
*/
public final class VCFHeaderReader {

private VCFHeaderReader(){}

/**
* Read a VCF header from a stream that may be a VCF file (possibly gzip or block compressed) or a BCF file.
* After successfully reading a header the stream is positioned immediately after the header, otherwise, if an
* exception is thrown, the state of the stream is undefined.
*
* @param in the stream to read the header from
* @return the VCF header read from the stream
* @throws TribbleException.InvalidHeader if the header in the file is invalid
* @throws IOException if an IOException occurs while reading the header
*/
public static VCFHeader readHeaderFrom(final SeekableStream in) throws IOException {
final long initialPos = in.position();
byte[] magicBytes = InputStreamUtils.readFully(bufferAndDecompressIfNecessary(in), BCFVersion.MAGIC_HEADER_START.length);
in.seek(initialPos);
if (magicBytes[0] == '#') { // VCF
return readHeaderFrom(in, new VCFCodec());
} else if (Arrays.equals(magicBytes, BCFVersion.MAGIC_HEADER_START)) {
return readHeaderFrom(in, new BCF2Codec());
}
throw new TribbleException.InvalidHeader("No VCF header found in " + in.getSource());
}

private static InputStream bufferAndDecompressIfNecessary(final InputStream in) throws IOException {
BufferedInputStream bis = new BufferedInputStream(in);
// despite the name, SamStreams.isGzippedSAMFile looks for any gzipped stream (including block compressed)
return SamStreams.isGzippedSAMFile(bis) ? new GZIPInputStream(bis) : bis;
}

private static <FEATURE_TYPE extends Feature, SOURCE> VCFHeader readHeaderFrom(final InputStream in, final FeatureCodec<FEATURE_TYPE, SOURCE> featureCodec) throws IOException {
InputStream is = bufferAndDecompressIfNecessary(in);
FeatureCodecHeader headerCodec = featureCodec.readHeader(featureCodec.makeSourceFromStream(is));
return (VCFHeader) headerCodec.getHeaderValue();
}
}
46 changes: 46 additions & 0 deletions src/test/java/htsjdk/variant/utils/VCFHeaderReaderTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
package htsjdk.variant.utils;

import htsjdk.HtsjdkTest;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.tribble.TribbleException;
import htsjdk.variant.vcf.VCFHeader;
import org.testng.Assert;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;

public class VCFHeaderReaderTest extends HtsjdkTest {
@DataProvider(name = "files")
Object[][] pathsData() {

final String TEST_DATA_DIR = "src/test/resources/htsjdk/variant/";
return new Object[][]{
{TEST_DATA_DIR + "VcfThatLacksAnIndex.bcf"},
{TEST_DATA_DIR + "VcfThatLacksAnIndex.vcf"},
{TEST_DATA_DIR + "VcfThatLacksAnIndex.vcf.bgz"},
{TEST_DATA_DIR + "VcfThatLacksAnIndex.vcf.gz"},
};
}

@Test(dataProvider = "files")
public void testReadHeaderFrom(final String file) throws IOException {
VCFHeader vcfHeader = VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new File(file)));
Assert.assertNotNull(vcfHeader);
}

@DataProvider
public Object[][] invalidFiles(){
return new Object[][] {
{ new File("src/test/resources/htsjdk/samtools/empty.bam")},
{new File("src/test/resources/htsjdk/variant/corrupt_file_that_starts_with_#.vcf")}
};
}

@Test(dataProvider = "invalidFiles", expectedExceptions = TribbleException.InvalidHeader.class)
public void testReadHeaderForInvalidFile(File file) throws IOException {
VCFHeaderReader.readHeaderFrom(new SeekableFileStream(file));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#
is not
a vcf

0 comments on commit 8b55de6

Please sign in to comment.