-
Notifications
You must be signed in to change notification settings - Fork 244
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add new class VCFHeaderReader (#1148)
* adding a new class VCFHeaderReader to read a VCFHeader from a stream without knowing in advance if the file is VCF or BCF, or compressed or not. * part of #1112
- Loading branch information
1 parent
52ec082
commit 8b55de6
Showing
3 changed files
with
110 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
package htsjdk.variant.utils; | ||
|
||
import htsjdk.samtools.SamStreams; | ||
import htsjdk.samtools.cram.io.InputStreamUtils; | ||
import htsjdk.samtools.seekablestream.SeekableStream; | ||
import htsjdk.tribble.Feature; | ||
import htsjdk.tribble.FeatureCodec; | ||
import htsjdk.tribble.FeatureCodecHeader; | ||
import htsjdk.tribble.TribbleException; | ||
import htsjdk.variant.bcf2.BCF2Codec; | ||
import htsjdk.variant.bcf2.BCFVersion; | ||
import htsjdk.variant.vcf.VCFCodec; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
|
||
import java.io.BufferedInputStream; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.util.Arrays; | ||
import java.util.zip.GZIPInputStream; | ||
|
||
/** | ||
* Utility class to read a VCF header without being told beforehand whether the input is VCF or BCF. | ||
*/ | ||
public final class VCFHeaderReader { | ||
|
||
private VCFHeaderReader(){} | ||
|
||
/** | ||
* Read a VCF header from a stream that may be a VCF file (possibly gzip or block compressed) or a BCF file. | ||
* After successfully reading a header the stream is positioned immediately after the header, otherwise, if an | ||
* exception is thrown, the state of the stream is undefined. | ||
* | ||
* @param in the stream to read the header from | ||
* @return the VCF header read from the stream | ||
* @throws TribbleException.InvalidHeader if the header in the file is invalid | ||
* @throws IOException if an IOException occurs while reading the header | ||
*/ | ||
public static VCFHeader readHeaderFrom(final SeekableStream in) throws IOException { | ||
final long initialPos = in.position(); | ||
byte[] magicBytes = InputStreamUtils.readFully(bufferAndDecompressIfNecessary(in), BCFVersion.MAGIC_HEADER_START.length); | ||
in.seek(initialPos); | ||
if (magicBytes[0] == '#') { // VCF | ||
return readHeaderFrom(in, new VCFCodec()); | ||
} else if (Arrays.equals(magicBytes, BCFVersion.MAGIC_HEADER_START)) { | ||
return readHeaderFrom(in, new BCF2Codec()); | ||
} | ||
throw new TribbleException.InvalidHeader("No VCF header found in " + in.getSource()); | ||
} | ||
|
||
private static InputStream bufferAndDecompressIfNecessary(final InputStream in) throws IOException { | ||
BufferedInputStream bis = new BufferedInputStream(in); | ||
// despite the name, SamStreams.isGzippedSAMFile looks for any gzipped stream (including block compressed) | ||
return SamStreams.isGzippedSAMFile(bis) ? new GZIPInputStream(bis) : bis; | ||
} | ||
|
||
private static <FEATURE_TYPE extends Feature, SOURCE> VCFHeader readHeaderFrom(final InputStream in, final FeatureCodec<FEATURE_TYPE, SOURCE> featureCodec) throws IOException { | ||
InputStream is = bufferAndDecompressIfNecessary(in); | ||
FeatureCodecHeader headerCodec = featureCodec.readHeader(featureCodec.makeSourceFromStream(is)); | ||
return (VCFHeader) headerCodec.getHeaderValue(); | ||
} | ||
} |
46 changes: 46 additions & 0 deletions
46
src/test/java/htsjdk/variant/utils/VCFHeaderReaderTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
package htsjdk.variant.utils; | ||
|
||
import htsjdk.HtsjdkTest; | ||
import htsjdk.samtools.seekablestream.SeekableFileStream; | ||
import htsjdk.tribble.TribbleException; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import org.testng.Assert; | ||
import org.testng.annotations.DataProvider; | ||
import org.testng.annotations.Test; | ||
|
||
import java.io.File; | ||
import java.io.FileNotFoundException; | ||
import java.io.IOException; | ||
|
||
public class VCFHeaderReaderTest extends HtsjdkTest { | ||
@DataProvider(name = "files") | ||
Object[][] pathsData() { | ||
|
||
final String TEST_DATA_DIR = "src/test/resources/htsjdk/variant/"; | ||
return new Object[][]{ | ||
{TEST_DATA_DIR + "VcfThatLacksAnIndex.bcf"}, | ||
{TEST_DATA_DIR + "VcfThatLacksAnIndex.vcf"}, | ||
{TEST_DATA_DIR + "VcfThatLacksAnIndex.vcf.bgz"}, | ||
{TEST_DATA_DIR + "VcfThatLacksAnIndex.vcf.gz"}, | ||
}; | ||
} | ||
|
||
@Test(dataProvider = "files") | ||
public void testReadHeaderFrom(final String file) throws IOException { | ||
VCFHeader vcfHeader = VCFHeaderReader.readHeaderFrom(new SeekableFileStream(new File(file))); | ||
Assert.assertNotNull(vcfHeader); | ||
} | ||
|
||
@DataProvider | ||
public Object[][] invalidFiles(){ | ||
return new Object[][] { | ||
{ new File("src/test/resources/htsjdk/samtools/empty.bam")}, | ||
{new File("src/test/resources/htsjdk/variant/corrupt_file_that_starts_with_#.vcf")} | ||
}; | ||
} | ||
|
||
@Test(dataProvider = "invalidFiles", expectedExceptions = TribbleException.InvalidHeader.class) | ||
public void testReadHeaderForInvalidFile(File file) throws IOException { | ||
VCFHeaderReader.readHeaderFrom(new SeekableFileStream(file)); | ||
} | ||
} |
3 changes: 3 additions & 0 deletions
3
src/test/resources/htsjdk/variant/corrupt_file_that_starts_with_#.vcf
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
# | ||
is not | ||
a vcf |