From dc09abc5af64a9e664dd93f3edd0cefc7afad1d0 Mon Sep 17 00:00:00 2001 From: Michael Macias Date: Thu, 1 Feb 2024 09:57:04 -0600 Subject: [PATCH] bcf/record: Read static fields from buffer --- noodles-bcf/src/async/io/reader.rs | 12 ++-- noodles-bcf/src/async/io/reader/query.rs | 7 +- noodles-bcf/src/async/io/reader/record.rs | 32 ++++----- noodles-bcf/src/io/reader.rs | 2 +- noodles-bcf/src/io/reader/record.rs | 28 ++++---- noodles-bcf/src/record.rs | 44 +++++++++---- noodles-bcf/src/record/convert.rs | 6 +- noodles-bcf/src/record/fields.rs | 80 +++++++++++++++++++++++ noodles-bcf/src/record/fields/bounds.rs | 9 +++ 9 files changed, 165 insertions(+), 55 deletions(-) create mode 100644 noodles-bcf/src/record/fields.rs create mode 100644 noodles-bcf/src/record/fields/bounds.rs diff --git a/noodles-bcf/src/async/io/reader.rs b/noodles-bcf/src/async/io/reader.rs index b6c33b501..6ce03a309 100644 --- a/noodles-bcf/src/async/io/reader.rs +++ b/noodles-bcf/src/async/io/reader.rs @@ -41,7 +41,6 @@ use crate::{ /// ``` pub struct Reader { inner: R, - buf: Vec, string_maps: StringMaps, } @@ -154,7 +153,7 @@ where /// # } /// ``` pub async fn read_record(&mut self, record: &mut Record) -> io::Result { - read_record(&mut self.inner, &mut self.buf, record).await + read_record(&mut self.inner, record).await } /// Returns an (async) stream over lazy records starting from the current (input) stream @@ -187,13 +186,13 @@ where /// ``` pub fn records(&mut self) -> impl Stream> + '_ { Box::pin(stream::try_unfold( - (&mut self.inner, Vec::new(), Record::default()), - |(mut reader, mut buf, mut record)| async { - read_record(&mut reader, &mut buf, &mut record) + (&mut self.inner, Record::default()), + |(mut reader, mut record)| async { + read_record(&mut reader, &mut record) .await .map(|n| match n { 0 => None, - _ => Some((record.clone(), (reader, buf, record))), + _ => Some((record.clone(), (reader, record))), }) }, )) @@ -321,7 +320,6 @@ impl From for Reader { fn from(inner: R) -> Self { Self { inner, - buf: Vec::new(), string_maps: StringMaps::default(), } } diff --git a/noodles-bcf/src/async/io/reader/query.rs b/noodles-bcf/src/async/io/reader/query.rs index 5c2595d92..0a8dc2f6d 100644 --- a/noodles-bcf/src/async/io/reader/query.rs +++ b/noodles-bcf/src/async/io/reader/query.rs @@ -96,10 +96,11 @@ fn intersects( chromosome_id: usize, region_interval: Interval, ) -> io::Result { - let id = record.chromosome_id(); + let id = record.chromosome_id()?; - let start = Position::try_from(usize::from(record.position())) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + let start = record.position().map(usize::from).and_then(|n| { + Position::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) + })?; let end = record.end().map(usize::from).and_then(|n| { Position::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) diff --git a/noodles-bcf/src/async/io/reader/record.rs b/noodles-bcf/src/async/io/reader/record.rs index 4f5fd5264..249dffebb 100644 --- a/noodles-bcf/src/async/io/reader/record.rs +++ b/noodles-bcf/src/async/io/reader/record.rs @@ -1,11 +1,7 @@ use crate::Record; use tokio::io::{self, AsyncRead, AsyncReadExt}; -pub(super) async fn read_record( - reader: &mut R, - buf: &mut Vec, - record: &mut Record, -) -> io::Result +pub(super) async fn read_record(reader: &mut R, record: &mut Record) -> io::Result where R: AsyncRead + Unpin, { @@ -21,14 +17,19 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - buf.resize(l_shared, Default::default()); - reader.read_exact(buf).await?; + let site_buf = record.fields_mut().site_buf_mut(); + site_buf.resize(l_shared, 0); + reader.read_exact(site_buf).await?; + + let buf = site_buf.clone(); let mut buf_reader = &buf[..]; let (n_fmt, n_sample) = read_site(&mut buf_reader, record)?; - let genotypes = record.genotypes.as_mut(); - genotypes.resize(l_indiv, Default::default()); - reader.read_exact(genotypes).await?; + let samples_buf = record.fields_mut().samples_buf_mut(); + samples_buf.resize(l_indiv, 0); + reader.read_exact(samples_buf).await?; + + *record.genotypes.as_mut() = samples_buf.clone(); record.genotypes.set_format_count(n_fmt); record.genotypes.set_sample_count(n_sample); @@ -56,14 +57,13 @@ mod tests { let string_maps: StringMaps = RAW_HEADER.parse()?; let mut reader = &DATA[..]; - let mut buf = Vec::new(); let mut record = Record::default(); - read_record(&mut reader, &mut buf, &mut record).await?; + read_record(&mut reader, &mut record).await?; - assert_eq!(record.chromosome_id(), 1); - assert_eq!(record.position(), Position::from(101)); - assert_eq!(record.rlen(), 1); - assert_eq!(record.quality_score(), Some(30.1)); + assert_eq!(record.chromosome_id()?, 1); + assert_eq!(record.position()?, Position::from(101)); + assert_eq!(record.rlen()?, 1); + assert_eq!(record.quality_score()?, Some(30.1)); assert_eq!(record.ids(), &"rs123".parse::()?); assert_eq!(record.reference_bases(), "A"); assert_eq!( diff --git a/noodles-bcf/src/io/reader.rs b/noodles-bcf/src/io/reader.rs index 6351bf95d..a29e9b1c9 100644 --- a/noodles-bcf/src/io/reader.rs +++ b/noodles-bcf/src/io/reader.rs @@ -174,7 +174,7 @@ where /// # Ok::<(), io::Error>(()) /// ``` pub fn read_record(&mut self, record: &mut Record) -> io::Result { - read_record(&mut self.inner, &mut self.buf, record) + read_record(&mut self.inner, record) } /// Returns an iterator over records starting from the current stream position. diff --git a/noodles-bcf/src/io/reader/record.rs b/noodles-bcf/src/io/reader/record.rs index 03b7ff850..fcfd6a3f9 100644 --- a/noodles-bcf/src/io/reader/record.rs +++ b/noodles-bcf/src/io/reader/record.rs @@ -9,7 +9,7 @@ use crate::{ Record, }; -pub fn read_record(reader: &mut R, buf: &mut Vec, record: &mut Record) -> io::Result +pub fn read_record(reader: &mut R, record: &mut Record) -> io::Result where R: Read, { @@ -23,14 +23,19 @@ where usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) })?; - buf.resize(l_shared, Default::default()); - reader.read_exact(buf)?; + let site_buf = record.fields_mut().site_buf_mut(); + site_buf.resize(l_shared, 0); + reader.read_exact(site_buf)?; + + let buf = site_buf.clone(); let mut buf_reader = &buf[..]; let (n_fmt, n_sample) = read_site(&mut buf_reader, record)?; - let genotypes = record.genotypes.as_mut(); - genotypes.resize(l_indiv, Default::default()); - reader.read_exact(genotypes)?; + let samples_buf = record.fields_mut().samples_buf_mut(); + samples_buf.resize(l_indiv, 0); + reader.read_exact(samples_buf)?; + + *record.genotypes.as_mut() = samples_buf.clone(); record.genotypes.set_format_count(n_fmt); record.genotypes.set_sample_count(n_sample); @@ -158,14 +163,13 @@ pub(crate) mod tests { let string_maps: StringMaps = RAW_HEADER.parse()?; let mut reader = &DATA[..]; - let mut buf = Vec::new(); let mut record = Record::default(); - read_record(&mut reader, &mut buf, &mut record)?; + read_record(&mut reader, &mut record)?; - assert_eq!(record.chromosome_id(), 1); - assert_eq!(record.position(), Position::from(101)); - assert_eq!(record.rlen(), 1); - assert_eq!(record.quality_score(), Some(30.1)); + assert_eq!(record.chromosome_id()?, 1); + assert_eq!(record.position()?, Position::from(101)); + assert_eq!(record.rlen()?, 1); + assert_eq!(record.quality_score()?, Some(30.1)); assert_eq!(record.ids(), &"rs123".parse::()?); assert_eq!(record.reference_bases(), "A"); assert_eq!( diff --git a/noodles-bcf/src/record.rs b/noodles-bcf/src/record.rs index 1ed396508..fe1496f52 100644 --- a/noodles-bcf/src/record.rs +++ b/noodles-bcf/src/record.rs @@ -2,6 +2,7 @@ pub(crate) mod codec; mod convert; +mod fields; mod filters; mod genotypes; mod info; @@ -11,6 +12,7 @@ use std::io; use noodles_vcf as vcf; +use self::fields::Fields; pub(crate) use self::value::Value; pub use self::{filters::Filters, genotypes::Genotypes, info::Info}; @@ -20,6 +22,7 @@ pub type ChromosomeId = usize; /// A BCF record. #[derive(Clone, Debug, PartialEq)] pub struct Record { + fields: Fields, pub(crate) chrom: ChromosomeId, pub(crate) pos: vcf::record::Position, pub(crate) rlen: usize, @@ -33,6 +36,10 @@ pub struct Record { } impl Record { + pub(crate) fn fields_mut(&mut self) -> &mut Fields { + &mut self.fields + } + /// Returns the chromosome ID of the record. /// /// The chromosome ID represents an index in the contig string map, which associates an ID (by @@ -45,10 +52,12 @@ impl Record { /// ``` /// use noodles_bcf as bcf; /// let record = bcf::Record::default(); - /// assert_eq!(record.chromosome_id(), 0); + /// assert_eq!(record.chromosome_id()?, 0); + /// # Ok::<_, std::io::Error>(()) /// ``` - pub fn chromosome_id(&self) -> ChromosomeId { - self.chrom + pub fn chromosome_id(&self) -> io::Result { + let n = self.fields.reference_sequence_id(); + usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) } /// Returns the start position of this record. @@ -61,14 +70,21 @@ impl Record { /// ``` /// use noodles_bcf as bcf; /// let record = bcf::Record::default(); - /// assert_eq!(usize::from(record.position()), 1); + /// assert_eq!(record.position().map(usize::from)?, 1); + /// # Ok::<_, std::io::Error>(()) /// ``` - pub fn position(&self) -> vcf::record::Position { - self.pos + pub fn position(&self) -> io::Result { + let n = self.fields.position(); + + usize::try_from(n) + .map(|m| m + 1) + .map(vcf::record::Position::from) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) } - pub(crate) fn rlen(&self) -> usize { - self.rlen + pub(crate) fn rlen(&self) -> io::Result { + let n = self.fields.span(); + usize::try_from(n).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) } /// Returns the end position of this record. @@ -87,8 +103,8 @@ impl Record { pub fn end(&self) -> io::Result { use vcf::record::Position; - let start = usize::from(self.position()); - let len = self.rlen(); + let start = self.position().map(usize::from)?; + let len = self.rlen()?; let end = start + len - 1; Ok(Position::from(end)) @@ -101,10 +117,11 @@ impl Record { /// ``` /// use noodles_bcf as bcf; /// let record = bcf::Record::default(); - /// assert!(record.quality_score().is_none()); + /// assert!(record.quality_score()?.is_none()); + /// # Ok::<_, std::io::Error>(()) /// ``` - pub fn quality_score(&self) -> Option { - self.qual + pub fn quality_score(&self) -> io::Result> { + self.fields.quality_score() } /// Returns the IDs. @@ -171,6 +188,7 @@ impl Record { impl Default for Record { fn default() -> Self { Self { + fields: Fields::default(), chrom: 0, pos: vcf::record::Position::from(1), rlen: 1, diff --git a/noodles-bcf/src/record/convert.rs b/noodles-bcf/src/record/convert.rs index 3b2ad3054..1045ce4db 100644 --- a/noodles-bcf/src/record/convert.rs +++ b/noodles-bcf/src/record/convert.rs @@ -37,7 +37,7 @@ impl Record { ) -> io::Result { let chromosome = string_maps .contigs() - .get_index(self.chromosome_id()) + .get_index(self.chromosome_id()?) .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidInput, "invalid chrom"))?; let filters = self @@ -54,14 +54,14 @@ impl Record { let mut builder = vcf::Record::builder() .set_chromosome(chromosome) - .set_position(self.position()) + .set_position(self.position()?) .set_ids(self.ids().clone()) .set_reference_bases(self.reference_bases()) .set_alternate_bases(self.alternate_bases().clone()) .set_info(info) .set_genotypes(genotypes); - if let Some(quality_score) = self.quality_score() { + if let Some(quality_score) = self.quality_score()? { builder = builder.set_quality_score(quality_score); } diff --git a/noodles-bcf/src/record/fields.rs b/noodles-bcf/src/record/fields.rs new file mode 100644 index 000000000..bf774a737 --- /dev/null +++ b/noodles-bcf/src/record/fields.rs @@ -0,0 +1,80 @@ +mod bounds; + +use std::io; + +use self::bounds::Bounds; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(crate) struct Fields { + site_buf: Vec, + samples_buf: Vec, + bounds: Bounds, +} + +impl Fields { + pub(crate) fn site_buf_mut(&mut self) -> &mut Vec { + &mut self.site_buf + } + + pub(crate) fn samples_buf_mut(&mut self) -> &mut Vec { + &mut self.samples_buf + } + + pub(super) fn reference_sequence_id(&self) -> i32 { + let src = &self.site_buf[bounds::REFERENCE_SEQUENCE_ID_RANGE]; + // SAFETY: `src` is 4 bytes. + i32::from_le_bytes(src.try_into().unwrap()) + } + + // N.B. this is 0-based. + pub(super) fn position(&self) -> i32 { + let src = &self.site_buf[bounds::POSITION_RANGE]; + // SAFETY: `src` is 4 bytes. + i32::from_le_bytes(src.try_into().unwrap()) + } + + pub(super) fn span(&self) -> i32 { + let src = &self.site_buf[bounds::SPAN_RANGE]; + // SAFETY: `src` is 4 bytes. + i32::from_le_bytes(src.try_into().unwrap()) + } + + pub(super) fn quality_score(&self) -> io::Result> { + use crate::record::codec::value::Float; + + let src = &self.site_buf[bounds::QUALITY_SCORE_RANGE]; + // SAFETY: `src` is 4 bytes. + let n = f32::from_le_bytes(src.try_into().unwrap()); + + match Float::from(n) { + Float::Value(n) => Ok(Some(n)), + Float::Missing => Ok(None), + _ => Err(io::Error::new( + io::ErrorKind::InvalidData, + "invalid quality score", + )), + } + } +} + +impl Default for Fields { + fn default() -> Self { + Self { + site_buf: vec![ + 0x00, 0x00, 0x00, 0x00, // chrom = 0 + 0x00, 0x00, 0x00, 0x00, // pos = 0 (0-based) + 0x01, 0x00, 0x00, 0x00, // rlen = 1 + 0x01, 0x00, 0x80, 0x7f, // qual = None + 0x00, 0x00, // n_info = 0 + 0x01, 0x00, // n_allele = 1 + 0x00, 0x00, 0x00, // n_sample = 0 + 0x00, // n_fmt = 0 + 0x07, // ids = [] + 0x17, 0x4e, // ref = N + 0x00, // filters = [] + ], + samples_buf: Vec::new(), + bounds: Bounds, + } + } +} diff --git a/noodles-bcf/src/record/fields/bounds.rs b/noodles-bcf/src/record/fields/bounds.rs new file mode 100644 index 000000000..9f839118d --- /dev/null +++ b/noodles-bcf/src/record/fields/bounds.rs @@ -0,0 +1,9 @@ +use std::ops::Range; + +pub(super) const REFERENCE_SEQUENCE_ID_RANGE: Range = 0..4; +pub(super) const POSITION_RANGE: Range = 4..8; +pub(super) const SPAN_RANGE: Range = 8..12; +pub(super) const QUALITY_SCORE_RANGE: Range = 12..16; + +#[derive(Clone, Debug, Eq, PartialEq)] +pub(super) struct Bounds;