Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: lazy vcf overreading into next record #224

Merged
merged 3 commits into from
Jan 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion noodles-vcf/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

### Fixed

* vcf/lazy/record/bounds: Fix range for info field ([#224]).
* vcf/lazy/record/bounds: Fix range for info field ([#223]).

* vcf/reader/lazy_record: Disallow newlines to appear in fields ([#224]).

[#223]: https://github.com/zaeleus/noodles/pull/223
[#224]: https://github.com/zaeleus/noodles/pull/224

## 0.48.0 - 2023-12-14
Expand Down
89 changes: 20 additions & 69 deletions noodles-vcf/src/async/reader.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
mod header;
mod query;

use std::str;

use futures::{stream, Stream};
use noodles_bgzf as bgzf;
use noodles_core::Region;
Expand Down Expand Up @@ -228,7 +226,7 @@ where
/// # }
/// ```
pub async fn read_lazy_record(&mut self, record: &mut lazy::Record) -> io::Result<usize> {
read_lazy_record(&mut self.inner, record).await
read_lazy_record(&mut self.inner, &mut self.buf, record).await
}

/// Returns an (async) stream over records starting from the current (input) stream position.
Expand Down Expand Up @@ -412,76 +410,22 @@ where
}
}

async fn read_lazy_record<R>(reader: &mut R, record: &mut lazy::Record) -> io::Result<usize>
async fn read_lazy_record<R>(
reader: &mut R,
buf: &mut String,
record: &mut lazy::Record,
) -> io::Result<usize>
where
R: AsyncBufRead + Unpin,
{
record.buf.clear();

let mut len = 0;

len += read_field(reader, &mut record.buf).await?;
record.bounds.chromosome_end = record.buf.len();

len += read_field(reader, &mut record.buf).await?;
record.bounds.position_end = record.buf.len();

len += read_field(reader, &mut record.buf).await?;
record.bounds.ids_end = record.buf.len();

len += read_field(reader, &mut record.buf).await?;
record.bounds.reference_bases_end = record.buf.len();

len += read_field(reader, &mut record.buf).await?;
record.bounds.alternate_bases_end = record.buf.len();
buf.clear();

len += read_field(reader, &mut record.buf).await?;
record.bounds.quality_score_end = record.buf.len();

len += read_field(reader, &mut record.buf).await?;
record.bounds.filters_end = record.buf.len();

len += read_field(reader, &mut record.buf).await?;
record.bounds.info_end = record.buf.len();

len += read_line(reader, &mut record.buf).await?;

Ok(len)
}

async fn read_field<R>(reader: &mut R, dst: &mut String) -> io::Result<usize>
where
R: AsyncBufRead + Unpin,
{
const DELIMITER: u8 = b'\t';

let mut is_delimiter = false;
let mut len = 0;

loop {
let src = reader.fill_buf().await?;

if is_delimiter || src.is_empty() {
break;
}

let (buf, n) = match src.iter().position(|&b| b == DELIMITER) {
Some(i) => {
is_delimiter = true;
(&src[..i], i + 1)
}
None => (src, src.len()),
};

let s = str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
dst.push_str(s);

len += n;

reader.consume(n);
if reader.read_line(buf).await? == 0 {
return Ok(0);
}

Ok(len)
let mut buf = buf.as_bytes();
crate::reader::lazy_record::read_lazy_record(&mut buf, record)
}

#[cfg(test)]
Expand All @@ -508,10 +452,11 @@ mod tests {

#[tokio::test]
async fn test_read_lazy_record() -> io::Result<()> {
let mut src = &b"sq0\t1\t.\tA\t.\t.\tPASS\t."[..];
let mut src = &b"sq0\t1\t.\tA\t.\t.\tPASS\t.\n"[..];
let mut buf = String::new();

let mut record = lazy::Record::default();
read_lazy_record(&mut src, &mut record).await?;
read_lazy_record(&mut src, &mut buf, &mut record).await?;

assert_eq!(record.buf, "sq01.A..PASS.");

Expand All @@ -524,6 +469,12 @@ mod tests {
assert_eq!(record.bounds.filters_end, 12);
assert_eq!(record.bounds.info_end, 13);

let mut src = &b"\n"[..];
assert!(matches!(
read_lazy_record(&mut src, &mut buf, &mut record).await,
Err(e) if e.kind() == io::ErrorKind::InvalidData,
));

Ok(())
}
}
98 changes: 3 additions & 95 deletions noodles-vcf/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@

mod builder;
mod header;
pub(crate) mod lazy_record;
pub(crate) mod query;
pub mod record;
mod records;

use crate::lazy;

use self::lazy_record::read_lazy_record;
pub(crate) use self::record::parse_record;
pub use self::{builder::Builder, query::Query, records::Records};
use crate::lazy;

use std::{
io::{self, BufRead, Read, Seek},
Expand Down Expand Up @@ -392,78 +393,6 @@ where
}
}

fn read_lazy_record<R>(reader: &mut R, record: &mut lazy::Record) -> io::Result<usize>
where
R: BufRead,
{
record.buf.clear();

let mut len = 0;

len += read_field(reader, &mut record.buf)?;
record.bounds.chromosome_end = record.buf.len();

len += read_field(reader, &mut record.buf)?;
record.bounds.position_end = record.buf.len();

len += read_field(reader, &mut record.buf)?;
record.bounds.ids_end = record.buf.len();

len += read_field(reader, &mut record.buf)?;
record.bounds.reference_bases_end = record.buf.len();

len += read_field(reader, &mut record.buf)?;
record.bounds.alternate_bases_end = record.buf.len();

len += read_field(reader, &mut record.buf)?;
record.bounds.quality_score_end = record.buf.len();

len += read_field(reader, &mut record.buf)?;
record.bounds.filters_end = record.buf.len();

len += read_field(reader, &mut record.buf)?;
record.bounds.info_end = record.buf.len();

len += read_line(reader, &mut record.buf)?;

Ok(len)
}

fn read_field<R>(reader: &mut R, dst: &mut String) -> io::Result<usize>
where
R: BufRead,
{
const DELIMITER: u8 = b'\t';

let mut is_delimiter = false;
let mut len = 0;

loop {
let src = reader.fill_buf()?;

if is_delimiter || src.is_empty() {
break;
}

let (buf, n) = match src.iter().position(|&b| b == DELIMITER) {
Some(i) => {
is_delimiter = true;
(&src[..i], i + 1)
}
None => (src, src.len()),
};

let s = str::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
dst.push_str(s);

len += n;

reader.consume(n);
}

Ok(len)
}

pub(crate) fn resolve_region<I>(index: &I, region: &Region) -> io::Result<(usize, Vec<u8>)>
where
I: BinningIndex,
Expand Down Expand Up @@ -541,25 +470,4 @@ sq0\t1\t.\tA\t.\t.\tPASS\t.

Ok(())
}

#[test]
fn test_read_lazy_record() -> io::Result<()> {
let mut src = &b"sq0\t1\t.\tA\t.\t.\tPASS\t."[..];

let mut record = lazy::Record::default();
read_lazy_record(&mut src, &mut record)?;

assert_eq!(record.buf, "sq01.A..PASS.");

assert_eq!(record.bounds.chromosome_end, 3);
assert_eq!(record.bounds.position_end, 4);
assert_eq!(record.bounds.ids_end, 5);
assert_eq!(record.bounds.reference_bases_end, 6);
assert_eq!(record.bounds.alternate_bases_end, 7);
assert_eq!(record.bounds.quality_score_end, 8);
assert_eq!(record.bounds.filters_end, 12);
assert_eq!(record.bounds.info_end, 13);

Ok(())
}
}
Loading