Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add an optimised version of CramContainerIterator #1129

Merged
merged 2 commits into from
Jun 5, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package htsjdk.samtools.cram.build;

import htsjdk.samtools.cram.common.Version;
import htsjdk.samtools.cram.io.CountingInputStream;
import htsjdk.samtools.cram.io.InputStreamUtils;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.ContainerIO;

import java.io.IOException;
import java.io.InputStream;

/**
* Iterate over CRAM containers from an input stream, and unlike {@link CramContainerIterator} only
* the header of each container is read, rather than the whole stream. As a result, the container block
* data is *not* populated, including the compression header block and slices.
*
* This class is useful when you are not interested in the contents of containers, for example when indexing container
* start positions.
*/
public class CramContainerHeaderIterator extends CramContainerIterator {

public CramContainerHeaderIterator(final InputStream inputStream) throws IOException {
super(inputStream);
}

protected Container containerFromStream(final Version cramVersion, final CountingInputStream countingStream) throws IOException {
final Container container = ContainerIO.readContainerHeader(cramVersion.major, countingStream);
InputStreamUtils.skipFully(countingStream, container.containerByteSize);
return container;
}

}
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package htsjdk.samtools.cram.build;

import htsjdk.samtools.cram.common.Version;
import htsjdk.samtools.cram.io.CountingInputStream;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.ContainerIO;
Expand All @@ -14,21 +15,21 @@
*/
public class CramContainerIterator implements Iterator<Container> {
private CramHeader cramHeader;
private InputStream inputStream;
private CountingInputStream countingInputStream;
private Container nextContainer;
private boolean eof = false;
private long offset = 0;

// Reads the CRAM header from the given stream, wraps the stream in a CountingInputStream and
// records the counting stream's byte count as the starting offset.
// NOTE(review): this span comes from a rendered diff without +/- markers. The direct
// readCramHeader(inputStream) call and the this.inputStream assignment look like the pre-change
// side of the diff, and the countingInputStream lines like the post-change side — confirm
// against the merged file before relying on this text as compilable code.
public CramContainerIterator(final InputStream inputStream) throws IOException {
cramHeader = CramIO.readCramHeader(inputStream);
this.inputStream = inputStream;
this.countingInputStream = new CountingInputStream(inputStream);
cramHeader = CramIO.readCramHeader(countingInputStream);
// The initial offset is whatever the counting stream has counted after the header read above.
this.offset = countingInputStream.getCount();
}

void readNextContainer() {
try {
final CountingInputStream cis = new CountingInputStream(inputStream);
nextContainer = ContainerIO.readContainer(cramHeader.getVersion(), cis);
final long containerSizeInBytes = cis.getCount();
nextContainer = containerFromStream(cramHeader.getVersion(), countingInputStream);
final long containerSizeInBytes = countingInputStream.getCount();

nextContainer.offset = offset;
offset += containerSizeInBytes;
Expand All @@ -42,6 +43,17 @@ void readNextContainer() {
}
}

/**
 * Consume the entirety of the next container from the stream.
 * <p>
 * The method is {@code protected} so that a subclass can substitute a different way of reading a container.
 *
 * @param cramVersion the CRAM version passed on to {@link ContainerIO#readContainer}
 * @param countingStream the stream the container is read from
 * @return The next Container from the stream.
 * @throws IOException if thrown while reading the container
 */
protected Container containerFromStream(final Version cramVersion, final CountingInputStream countingStream) throws IOException {
    // Honour the version handed in by the caller rather than re-reading it from the cramHeader field.
    return ContainerIO.readContainer(cramVersion, countingStream);
}

@Override
public boolean hasNext() {
if (eof) return false;
Expand Down Expand Up @@ -70,7 +82,7 @@ public void close() {
cramHeader = null;
//noinspection EmptyCatchBlock
try {
inputStream.close();
countingInputStream.close();
} catch (final Exception e) {
}
}
Expand Down
25 changes: 25 additions & 0 deletions src/main/java/htsjdk/samtools/cram/io/InputStreamUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -77,4 +77,29 @@ public static void readFully(final InputStream inputStream, final byte[] b, fina
n += count;
}
}

/**
* Skip the specified number of bytes from the {@link InputStream}, calling
* {@link InputStream#skip(long)} repeatedly until {@code length} bytes have been consumed.
* A {@code length} of zero or less skips nothing, because the loop only runs while bytes remain.
* @param in the input stream to skip bytes from
* @param length the number of bytes to skip
* @throws IOException as per java IO contract
* @throws EOFException if there are fewer than {@code length} bytes left in the stream
*/
public static void skipFully(final InputStream in, final long length) throws IOException {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we add a positive test, plus one that fails to consume the entire length.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

// amt is the number of bytes that still have to be skipped
long amt = length;
while (amt > 0) {
long ret = in.skip(amt);
if (ret == 0) {
// skip may return 0 even if we're not at EOF. Luckily, we can
// use the read() method to figure out if we're at the end.
int b = in.read();
if (b == -1) {
// (length - amt) is the number of bytes consumed before the end of the stream was hit
throw new EOFException( "Premature EOF from inputStream after " +
"skipping " + (length - amt) + " byte(s).");
}
// read() consumed one byte, so count it as one skipped byte
ret = 1;
}
amt -= ret;
}
}
}
57 changes: 57 additions & 0 deletions src/test/java/htsjdk/samtools/CramContainerHeaderIteratorTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package htsjdk.samtools;

import htsjdk.HtsjdkTest;
import htsjdk.samtools.cram.build.CramContainerHeaderIterator;
import htsjdk.samtools.cram.build.CramContainerIterator;
import htsjdk.samtools.cram.structure.Container;
import htsjdk.samtools.cram.structure.CramHeader;
import htsjdk.samtools.seekablestream.SeekableFileStream;
import htsjdk.samtools.util.Iterables;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.File;
import java.io.IOException;
import java.util.List;

/**
* Checks that {@link CramContainerHeaderIterator} yields the same CRAM header and the same per-container
* header fields as {@link CramContainerIterator} for one test CRAM file, while leaving the
* blocks, compression header and slices of each container unset.
*/
public class CramContainerHeaderIteratorTest extends HtsjdkTest {
@Test
public void test() throws IOException {
final File cramFile = new File("src/test/resources/htsjdk/samtools/cram/NA12878.20.21.1-100.100-SeqsPerSlice.0-unMapped.cram");
// First pass: collect the header and containers using the full iterator.
CramHeader expectedHeader;
List<Container> fullContainers;
try (SeekableFileStream seekableFileStream = new SeekableFileStream(cramFile)) {
CramContainerIterator iterator = new CramContainerIterator(seekableFileStream);
expectedHeader = iterator.getCramHeader();
fullContainers = Iterables.slurp(iterator);
}
// Second pass over the same file using the header-only iterator.
CramHeader actualHeader;
List<Container> headerOnlyContainers;
try (SeekableFileStream seekableFileStream = new SeekableFileStream(cramFile)) {
CramContainerHeaderIterator iterator = new CramContainerHeaderIterator(seekableFileStream);
actualHeader = iterator.getCramHeader();
headerOnlyContainers = Iterables.slurp(iterator);
}
Assert.assertEquals(actualHeader, expectedHeader);
Assert.assertEquals(fullContainers.size(), headerOnlyContainers.size());
// Compare the two lists container by container.
for (int i = 0; i < fullContainers.size(); i++) {
Container fullContainer = fullContainers.get(i);
Container headerOnlyContainer = headerOnlyContainers.get(i);
Assert.assertEquals(headerOnlyContainer.containerByteSize, fullContainer.containerByteSize);
Assert.assertEquals(headerOnlyContainer.sequenceId, fullContainer.sequenceId);
Assert.assertEquals(headerOnlyContainer.alignmentStart, fullContainer.alignmentStart);
Assert.assertEquals(headerOnlyContainer.alignmentSpan, fullContainer.alignmentSpan);
Assert.assertEquals(headerOnlyContainer.nofRecords, fullContainer.nofRecords);
Assert.assertEquals(headerOnlyContainer.globalRecordCounter, fullContainer.globalRecordCounter);
Assert.assertEquals(headerOnlyContainer.bases, fullContainer.bases);
Assert.assertEquals(headerOnlyContainer.blockCount, fullContainer.blockCount);
Assert.assertEquals(headerOnlyContainer.landmarks, fullContainer.landmarks);
Assert.assertEquals(headerOnlyContainer.checksum, fullContainer.checksum);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add an equality assert here for container.offset.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Adding the assert exposes the offset issue, and your fix resolves it.

// Both iterators must report the same offset for each container.
Assert.assertEquals(headerOnlyContainer.offset, fullContainer.offset);
// unpopulated fields
Assert.assertNull(headerOnlyContainer.blocks);
Assert.assertNull(headerOnlyContainer.header);
Assert.assertNull(headerOnlyContainer.slices);
}
}
}
32 changes: 32 additions & 0 deletions src/test/java/htsjdk/samtools/cram/io/InputStreamUtilsTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package htsjdk.samtools.cram.io;

import htsjdk.HtsjdkTest;
import org.testng.Assert;
import org.testng.annotations.Test;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;

public class InputStreamUtilsTest extends HtsjdkTest {

    /**
     * Builds the fixture shared by both tests: eight bytes (0..7) behind a BufferedInputStream whose
     * 4-byte buffer is smaller than the amounts being skipped, so skipFully has to refill the buffer.
     */
    private static InputStream newSmallBufferedStream() {
        final byte[] bytes = new byte[] { 0, 1, 2, 3, 4, 5, 6, 7 };
        return new BufferedInputStream(new ByteArrayInputStream(bytes), 4);
    }

    /** Skipping six of the eight bytes must leave 6 and 7 readable, followed by end of stream. */
    @Test
    public void testSkipFully() throws IOException {
        final InputStream stream = newSmallBufferedStream();
        InputStreamUtils.skipFully(stream, 6);
        // the two remaining bytes, then -1 for EOF
        final int[] expectedReads = { 6, 7, -1 };
        for (final int expected : expectedReads) {
            Assert.assertEquals(stream.read(), expected);
        }
    }

    /** Asking to skip more bytes than the stream holds must raise EOFException. */
    @Test(expectedExceptions = EOFException.class)
    public void testSkipFullyPastEOF() throws IOException {
        final InputStream stream = newSmallBufferedStream();
        InputStreamUtils.skipFully(stream, 10);
    }
}