Skip to content

Commit

Permalink
bcf/async/io/reader/header: Parse header line by line
Browse files Browse the repository at this point in the history
The async header reader now builds a `vcf::Header` by parsing the raw
header line by line, so the entire raw header no longer needs to be
read into memory before parsing.
  • Loading branch information
zaeleus committed Jan 1, 2025
1 parent ee1a98f commit 4d6c950
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 9 deletions.
6 changes: 6 additions & 0 deletions noodles-bcf/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@

### Changed

* bcf/async/io/reader/header: Parse header line by line.

The async header reader now builds a `vcf::Header` by parsing the raw header
line by line, so the entire raw header no longer needs to be read into memory
before parsing.

* bcf/async/io/reader/header: Discard VCF header padding.

## 0.66.0 - 2024-12-20
Expand Down
54 changes: 45 additions & 9 deletions noodles-bcf/src/async/io/reader/header.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ mod format_version;
mod magic_number;
mod vcf_header;

use noodles_vcf as vcf;
use tokio::io::{self, AsyncRead, AsyncReadExt};
use noodles_vcf::{self as vcf, header::StringMaps};
use tokio::io::{self, AsyncBufRead, AsyncBufReadExt, AsyncRead, AsyncReadExt};

use self::{format_version::read_format_version, magic_number::read_magic_number};
use crate::MAGIC_NUMBER;
Expand Down Expand Up @@ -61,21 +61,57 @@ async fn read_vcf_header<R>(reader: &mut vcf_header::Reader<R>) -> io::Result<vc
where
R: AsyncRead + Unpin,
{
let mut raw_header = String::new();
reader.read_to_string(&mut raw_header).await?;
let mut parser = vcf::header::Parser::default();
let mut string_maps = StringMaps::default();

let mut buf = Vec::new();

while read_line(reader, &mut buf).await? != 0 {
let entry = parser
.parse_partial(&buf)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

string_maps
.insert_entry(&entry)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
}

reader.discard_to_end().await?;

let mut header: vcf::Header = raw_header
.parse()
let mut header = parser
.finish()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;

*header.string_maps_mut() = raw_header
.parse()
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
*header.string_maps_mut() = string_maps;

Ok(header)
}

/// Reads one line from `reader` into `dst`, without its line terminator.
///
/// `dst` is cleared before reading. A trailing line feed is removed, along
/// with the carriage return immediately preceding it, if any.
///
/// The returned value is the byte count reported by `read_until`, i.e., it is
/// taken before the terminator is stripped. When that count is 0, `dst` is
/// left empty.
///
/// # Errors
///
/// Any I/O error raised by the underlying reader is propagated.
async fn read_line<R>(reader: &mut R, dst: &mut Vec<u8>) -> io::Result<usize>
where
    R: AsyncBufRead + Unpin,
{
    dst.clear();

    let bytes_read = reader.read_until(b'\n', dst).await?;

    // A carriage return is only dropped when it is part of a CRLF pair; a
    // final line lacking a line feed is left untouched.
    if dst.last() == Some(&b'\n') {
        dst.pop();

        if dst.last() == Some(&b'\r') {
            dst.pop();
        }
    }

    Ok(bytes_read)
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down

0 comments on commit 4d6c950

Please sign in to comment.