Skip to content

Commit

Permalink
vcf/reader/header: Parse header line by line
Browse files Browse the repository at this point in the history
The header parser can now build a `vcf::Header` by parsing a raw header
line by line, so the entire raw header no longer has to be read into
memory before parsing.
  • Loading branch information
zaeleus committed Oct 25, 2023
1 parent 1f5fa65 commit 45d91ca
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 58 deletions.
6 changes: 6 additions & 0 deletions noodles-vcf/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
Change instantiations of `vcf::reader::Builder` to
`vcf::reader::Builder::default()`.

* vcf/reader/header: Parse header line by line.

The header parser can now build a `vcf::Header` by parsing a raw header
line by line, so the entire raw header no longer has to be read into
memory before parsing.

* vcf/writer/builder: Add `Builder::build_from_writer`.

* vcf/writer/builder: Add a compression method setter.
Expand Down
155 changes: 97 additions & 58 deletions noodles-vcf/src/reader/header.rs
Original file line number Diff line number Diff line change
@@ -1,79 +1,96 @@
use std::io::{self, BufRead};
use std::{
io::{self, BufRead},
str,
};

use crate::Header;
use crate::{header, Header};

pub(super) fn read_header<R>(reader: &mut R) -> io::Result<Header>
where
R: BufRead,
{
read_raw_header(reader).and_then(|s| {
s.parse()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))
})
let mut parser = header::Parser::default();
let mut buf = Vec::new();

while read_header_line(reader, &mut buf)? != 0 {
let s = str::from_utf8(&buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

parser
.parse_partial(s)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
}

parser
.finish()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
}

fn read_raw_header<R>(reader: &mut R) -> io::Result<String>
fn read_header_line<R>(reader: &mut R, dst: &mut Vec<u8>) -> io::Result<usize>
where
R: BufRead,
{
use memchr::memchr;

const HEADER_PREFIX: u8 = b'#';
const PREFIX: u8 = b'#';
const LINE_FEED: u8 = b'\n';
const CARRIAGE_RETURN: u8 = b'\r';

let mut buf = Vec::new();

let mut is_first_line = true;
let mut is_eol = false;
let src = reader.fill_buf()?;

loop {
let src = reader.fill_buf()?;

let is_eof = src.is_empty();
let is_end_of_header = || (is_first_line || is_eol) && src[0] != HEADER_PREFIX;
if src.is_empty() || src[0] != PREFIX {
return Ok(0);
}

if is_eof || is_end_of_header() {
break;
}
dst.clear();

let (read_eol, len) = if let Some(i) = memchr(LINE_FEED, src) {
buf.extend(&src[..=i]);
(true, i + 1)
} else {
buf.extend(src);
(false, src.len())
};
match reader.read_until(LINE_FEED, dst)? {
0 => Ok(0),
n => {
if dst.ends_with(&[LINE_FEED]) {
dst.pop();

is_first_line = false;
is_eol = read_eol;
if dst.ends_with(&[CARRIAGE_RETURN]) {
dst.pop();
}
}

reader.consume(len);
Ok(n)
}
}

String::from_utf8(buf).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))
}

#[cfg(test)]
mod tests {
use super::*;

/// Test helper: calls `read_header_line` repeatedly on `reader` and gathers
/// a copy of the line buffer after each call that reports a nonzero count.
///
/// Stops at the first call that returns 0 and propagates any I/O error.
fn collect_lines<R>(reader: &mut R) -> io::Result<Vec<Vec<u8>>>
where
    R: BufRead,
{
    let mut line = Vec::new();
    let mut collected = Vec::new();

    loop {
        if read_header_line(reader, &mut line)? == 0 {
            return Ok(collected);
        }

        // The buffer is reused by the next call, so keep an owned copy.
        collected.push(line.clone());
    }
}

#[test]
fn test_read_raw_header() -> io::Result<()> {
static DATA: &[u8] = b"\
##fileformat=VCFv4.3
let src = b"##fileformat=VCFv4.3
##fileDate=20200501
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
sq0\t1\t.\tA\t.\t.\tPASS\t.
";

let mut reader = DATA;
let mut reader = &src[..];

let actual = read_raw_header(&mut reader)?;
let expected = "\
##fileformat=VCFv4.3
##fileDate=20200501
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
";
let actual = collect_lines(&mut reader)?;
let expected = [
b"##fileformat=VCFv4.3".to_vec(),
b"##fileDate=20200501".to_vec(),
b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO".to_vec(),
];

assert_eq!(actual, expected);

Expand All @@ -82,10 +99,17 @@ sq0\t1\t.\tA\t.\t.\tPASS\t.

#[test]
fn test_read_raw_header_with_no_records() -> io::Result<()> {
let expected = "##fileformat=VCFv4.3\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
let src = b"##fileformat=VCFv4.3
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
";

let mut reader = expected.as_bytes();
let actual = read_raw_header(&mut reader)?;
let mut reader = &src[..];

let actual = collect_lines(&mut reader)?;
let expected = [
b"##fileformat=VCFv4.3".to_vec(),
b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO".to_vec(),
];

assert_eq!(actual, expected);

Expand All @@ -96,10 +120,16 @@ sq0\t1\t.\tA\t.\t.\tPASS\t.
fn test_read_raw_header_with_multiple_buffer_fills() -> io::Result<()> {
use std::io::BufReader;

let expected = "##fileformat=VCFv4.3\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
let src = b"##fileformat=VCFv4.3
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO
";

let mut reader = BufReader::with_capacity(16, expected.as_bytes());
let actual = read_raw_header(&mut reader)?;
let mut reader = BufReader::with_capacity(16, &src[..]);
let actual = collect_lines(&mut reader)?;
let expected = [
b"##fileformat=VCFv4.3".to_vec(),
b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO".to_vec(),
];

assert_eq!(actual, expected);

Expand All @@ -108,25 +138,34 @@ sq0\t1\t.\tA\t.\t.\tPASS\t.

#[test]
fn test_read_raw_header_with_no_header() -> io::Result<()> {
let data = [];
let mut reader = &data[..];
let actual = read_raw_header(&mut reader)?;
let src = [];
let mut reader = &src[..];
let actual = collect_lines(&mut reader)?;
assert!(actual.is_empty());

let data = b"sq0\t1\t.\tA\t.\t.\tPASS\t.\n";
let mut reader = &data[..];
let actual = read_raw_header(&mut reader)?;
let src = b"sq0\t1\t.\tA\t.\t.\tPASS\t.\n";
let mut reader = &src[..];
let actual = collect_lines(&mut reader)?;
assert!(actual.is_empty());

Ok(())
}

#[test]
fn test_read_raw_header_with_missing_end_of_line() -> io::Result<()> {
let expected = "##fileformat=VCFv4.3\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";
let mut reader = expected.as_bytes();
let actual = read_raw_header(&mut reader)?;
let src = b"##fileformat=VCFv4.3
#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO";

let mut reader = &src[..];

let actual = collect_lines(&mut reader)?;
let expected = [
b"##fileformat=VCFv4.3".to_vec(),
b"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO".to_vec(),
];

assert_eq!(actual, expected);

Ok(())
}
}

0 comments on commit 45d91ca

Please sign in to comment.