From bb20528c63156238519aafeb0a7ed15214c21028 Mon Sep 17 00:00:00 2001 From: Trent Hauck Date: Wed, 31 Jul 2024 18:52:51 -0700 Subject: [PATCH] feat: finish transition to file options (#166) * refactor: fill out file inference * feat: finish transition to file options --- Cargo.toml | 2 +- src/datasources/bed.rs | 26 ++------------ src/datasources/fasta.rs | 24 ++----------- src/datasources/fastq.rs | 22 ++---------- src/datasources/genbank.rs | 17 ++++++--- src/datasources/gff.rs | 11 ++++-- src/datasources/gtf.rs | 21 +++++++---- src/datasources/mzml.rs | 30 ++++++++++++---- src/datasources/vcf.rs | 18 ++++++---- src/file_compression_type.rs | 2 +- src/file_options.rs | 25 +++++++------ .../settable_from_file_options.rs | 16 +++++++++ src/session_context.rs | 36 ++++++++++++++----- 13 files changed, 139 insertions(+), 111 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 0b44546..4d116b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,7 @@ name = "biobear" [dependencies] arrow = { version = "52.1.0", features = ["pyarrow"] } datafusion = "40" -exon = { version = "0.29.1", features = ["default"] } +exon = { version = "0.30.0", features = ["default"] } pyo3 = "0.21.2" tokio = { version = "1", features = ["rt"] } noodles = { version = "0.78", features = ["core"] } diff --git a/src/datasources/bed.rs b/src/datasources/bed.rs index 3b1fe40..a886877 100644 --- a/src/datasources/bed.rs +++ b/src/datasources/bed.rs @@ -15,7 +15,7 @@ use exon::datasources::bed::table_provider::ListingBEDTableOptions; use pyo3::{pyclass, pymethods}; -use crate::{error::BioBearResult, file_options::FileOptions, FileCompressionType}; +use crate::{file_options::impl_settable_from_file_options, FileCompressionType}; #[pyclass] #[derive(Debug, Clone, Default)] @@ -31,6 +31,8 @@ pub struct BEDReadOptions { file_extension: Option, } +impl_settable_from_file_options!(BEDReadOptions); + #[pymethods] impl BEDReadOptions { #[new] @@ -48,28 +50,6 @@ impl BEDReadOptions { } } -impl BEDReadOptions { - pub(crate) fn update_from_file_options( - &mut self, - file_options: &FileOptions, - ) -> BioBearResult<()> { - if let Some(file_extension) = file_options.file_extension() { - if self.file_extension.is_none() { - self.file_extension = Some(file_extension.to_string()); - } - } - - if let Some(file_compression_type) = file_options.file_compression_type() { - if self.file_compression_type.is_none() { - let fct = FileCompressionType::try_from(file_compression_type)?; - self.file_compression_type = Some(fct); - } - } - - Ok(()) - } -} - impl From for ListingBEDTableOptions { fn from(options: BEDReadOptions) -> Self { let file_compression_type = options diff --git a/src/datasources/fasta.rs b/src/datasources/fasta.rs index 0bc18e9..eb3964c 100644 --- a/src/datasources/fasta.rs +++ b/src/datasources/fasta.rs @@ -13,7 +13,7 @@ // limitations under the License. use crate::{ - error::BioBearResult, file_compression_type::FileCompressionType, file_options::FileOptions, + file_compression_type::FileCompressionType, file_options::impl_settable_from_file_options, }; use exon::datasources::fasta::{table_provider::ListingFASTATableOptions, SequenceDataType}; use pyo3::{pyclass, pymethods}; @@ -107,27 +107,7 @@ impl FASTAReadOptions { } } -impl FASTAReadOptions { - pub(crate) fn update_from_file_options( - &mut self, - file_options: &FileOptions, - ) -> BioBearResult<()> { - if let Some(file_extension) = file_options.file_extension() { - if self.file_extension.is_none() { - self.file_extension = Some(file_extension.to_string()); - } - } - - if let Some(file_compression_type) = file_options.file_compression_type() { - if self.file_compression_type.is_none() { - let fct = FileCompressionType::try_from(file_compression_type)?; - self.file_compression_type = Some(fct); - } - } - - Ok(()) - } -} +impl_settable_from_file_options!(FASTAReadOptions); impl From for ListingFASTATableOptions { fn from(options: FASTAReadOptions) -> Self { diff --git a/src/datasources/fastq.rs b/src/datasources/fastq.rs index bf4292b..1401bd4 100644 --- a/src/datasources/fastq.rs +++ b/src/datasources/fastq.rs @@ -13,7 +13,7 @@ // limitations under the License. use crate::{ - error::BioBearResult, file_compression_type::FileCompressionType, file_options::FileOptions, + file_compression_type::FileCompressionType, file_options::impl_settable_from_file_options, }; use exon::datasources::fastq::table_provider::ListingFASTQTableOptions; use pyo3::{pyclass, pymethods}; @@ -51,6 +51,8 @@ pub struct FASTQReadOptions { file_compression_type: Option, } +impl_settable_from_file_options!(FASTQReadOptions); + #[pymethods] impl FASTQReadOptions { #[new] @@ -84,24 +86,6 @@ impl FASTQReadOptions { } } -impl FASTQReadOptions { - pub(crate) fn update_from_file_options( - &mut self, - file_options: &FileOptions, - ) -> BioBearResult<()> { - if let Some(file_extension) = file_options.file_extension() { - self.file_extension = Some(file_extension.to_string()); - } - - if let Some(file_compression_type) = file_options.file_compression_type() { - let fct = FileCompressionType::try_from(file_compression_type)?; - self.file_compression_type = Some(fct); - } - - Ok(()) - } -} - impl From for ListingFASTQTableOptions { fn from(options: FASTQReadOptions) -> Self { let file_compression_type = options diff --git a/src/datasources/genbank.rs b/src/datasources/genbank.rs index 69be665..13715c9 100644 --- a/src/datasources/genbank.rs +++ b/src/datasources/genbank.rs @@ -15,28 +15,37 @@ use exon::datasources::genbank::table_provider::ListingGenbankTableOptions; use pyo3::{pyclass, pymethods}; -use crate::FileCompressionType; +use crate::{file_options::impl_settable_from_file_options, FileCompressionType}; #[pyclass] #[derive(Debug, Clone, Default)] /// Options for reading GenBank files. pub struct GenBankReadOptions { /// The file compression type. - file_compression_type: FileCompressionType, + file_compression_type: Option, + /// The file extension. + file_extension: Option, } +impl_settable_from_file_options!(GenBankReadOptions); + #[pymethods] impl GenBankReadOptions { #[new] fn new(file_compression_type: Option) -> Self { Self { - file_compression_type: file_compression_type.unwrap_or_default(), + file_compression_type, + file_extension: Some("gb".to_string()), } } } impl From for ListingGenbankTableOptions { fn from(options: GenBankReadOptions) -> Self { - ListingGenbankTableOptions::new(options.file_compression_type.into()) + let c = options + .file_compression_type + .unwrap_or(FileCompressionType::UNCOMPRESSED); + + ListingGenbankTableOptions::new(c.into()) } } diff --git a/src/datasources/gff.rs b/src/datasources/gff.rs index df5de45..4dcbea5 100644 --- a/src/datasources/gff.rs +++ b/src/datasources/gff.rs @@ -16,7 +16,11 @@ use exon::datasources::gff::table_provider::ListingGFFTableOptions; use noodles::core::Region; use pyo3::{pyclass, pymethods, PyResult}; -use crate::{error::BioBearResult, file_options::FileOptions, FileCompressionType}; +use crate::{ + error::BioBearResult, + file_options::{impl_settable_from_file_options, FileOptions}, + FileCompressionType, +}; use super::parse_region; @@ -28,6 +32,8 @@ pub struct GFFReadOptions { file_compression_type: Option, } +impl_settable_from_file_options!(GFFReadOptions); + #[pymethods] impl GFFReadOptions { #[new] @@ -56,8 +62,7 @@ impl GFFReadOptions { if let Some(file_compression_type) = options.file_compression_type() { if self.file_compression_type.is_none() { - let fct = FileCompressionType::try_from(file_compression_type)?; - self.file_compression_type = Some(fct); + self.file_compression_type = Some(file_compression_type); } } diff --git a/src/datasources/gtf.rs b/src/datasources/gtf.rs index 778adbd..44cd83a 100644 --- a/src/datasources/gtf.rs +++ b/src/datasources/gtf.rs @@ -15,35 +15,44 @@ use exon::datasources::gtf::table_provider::ListingGTFTableOptions; use pyo3::{pyclass, pymethods}; -use crate::FileCompressionType; +use crate::{file_options::impl_settable_from_file_options, FileCompressionType}; #[pyclass] #[derive(Debug, Clone)] pub struct GTFReadOptions { - file_compression_type: FileCompressionType, + file_compression_type: Option, + file_extension: Option, } impl Default for GTFReadOptions { fn default() -> Self { Self { - file_compression_type: FileCompressionType::UNCOMPRESSED, + file_compression_type: Some(FileCompressionType::UNCOMPRESSED), + file_extension: None, } } } +impl_settable_from_file_options!(GTFReadOptions); + #[pymethods] impl GTFReadOptions { #[new] pub fn new(file_compression_type: Option) -> Self { Self { - file_compression_type: file_compression_type - .unwrap_or(FileCompressionType::UNCOMPRESSED), + file_compression_type, + file_extension: Some("gtf".to_string()), } } } impl From for ListingGTFTableOptions { fn from(options: GTFReadOptions) -> Self { - ListingGTFTableOptions::new(options.file_compression_type.into()) + ListingGTFTableOptions::new( + options + .file_compression_type + .map(|c| c.into()) + .unwrap_or(datafusion::datasource::file_format::file_compression_type::FileCompressionType::UNCOMPRESSED), + ) } } diff --git a/src/datasources/mzml.rs b/src/datasources/mzml.rs index dac65e7..4ebb742 100644 --- a/src/datasources/mzml.rs +++ b/src/datasources/mzml.rs @@ -15,19 +15,23 @@ use exon::datasources::mzml::table_provider::ListingMzMLTableOptions; use pyo3::{pyclass, pymethods}; -use crate::FileCompressionType; +use crate::{file_options::impl_settable_from_file_options, FileCompressionType}; #[pyclass] #[derive(Debug, Clone)] /// Options for reading mzML files. pub struct MzMLReadOptions { - file_compression_type: FileCompressionType, + file_compression_type: Option, + file_extension: Option, } +impl_settable_from_file_options!(MzMLReadOptions); + impl Default for MzMLReadOptions { fn default() -> Self { Self { - file_compression_type: FileCompressionType::UNCOMPRESSED, + file_compression_type: Some(FileCompressionType::UNCOMPRESSED), + file_extension: None, } } } @@ -37,14 +41,28 @@ impl MzMLReadOptions { #[new] fn new(file_compression_type: Option) -> Self { Self { - file_compression_type: file_compression_type - .unwrap_or(FileCompressionType::UNCOMPRESSED), + file_compression_type: Some( + file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED), + ), + file_extension: None, } } } impl From for ListingMzMLTableOptions { fn from(options: MzMLReadOptions) -> Self { - ListingMzMLTableOptions::new(options.file_compression_type.into()) + let file_compression_type = options + .file_compression_type + .unwrap_or(FileCompressionType::UNCOMPRESSED); + + let mut new_options = ListingMzMLTableOptions::new(file_compression_type.into()); + + // let file_extension = options.file_extension; + if let Some(fe) = options.file_extension { + eprintln!("Setting file extension to {}", fe); + new_options = new_options.with_file_extension(fe) + } + + new_options } } diff --git a/src/datasources/vcf.rs b/src/datasources/vcf.rs index 06c8b70..9f7dfe2 100644 --- a/src/datasources/vcf.rs +++ b/src/datasources/vcf.rs @@ -17,7 +17,7 @@ use exon::datasources::vcf::ListingVCFTableOptions; use noodles::core::Region; use pyo3::{pyclass, pymethods, PyResult}; -use crate::FileCompressionType; +use crate::{file_options::impl_settable_from_file_options, FileCompressionType}; use super::parse_region; @@ -28,15 +28,19 @@ pub struct VCFReadOptions { /// The region to read. region: Option, /// The file compression type. - file_compression_type: FileCompressionType, + file_compression_type: Option, /// True if the INFO column should be parsed. parse_info: bool, /// True if the FORMAT column should be parsed. parse_formats: bool, /// The partition fields. partition_cols: Option>, + /// The file extension. + file_extension: Option, } +impl_settable_from_file_options!(VCFReadOptions); + #[pymethods] impl VCFReadOptions { #[new] @@ -50,22 +54,24 @@ impl VCFReadOptions { ) -> PyResult { let region = parse_region(region)?; - let file_compression_type = - file_compression_type.unwrap_or(FileCompressionType::UNCOMPRESSED); - Ok(Self { region, file_compression_type, parse_info, parse_formats, partition_cols, + file_extension: Some("vcf".to_string()), }) } } impl From for ListingVCFTableOptions { fn from(options: VCFReadOptions) -> Self { - let mut o = ListingVCFTableOptions::new(options.file_compression_type.into(), false) + let compression = options + .file_compression_type + .unwrap_or(FileCompressionType::UNCOMPRESSED); + + let mut o = ListingVCFTableOptions::new(compression.into(), false) .with_parse_info(options.parse_info) .with_parse_formats(options.parse_formats); diff --git a/src/file_compression_type.rs b/src/file_compression_type.rs index 0012fef..a0f43c2 100644 --- a/src/file_compression_type.rs +++ b/src/file_compression_type.rs @@ -23,7 +23,7 @@ use pyo3::prelude::*; use crate::error::BioBearError; #[pyclass] -#[derive(Debug, Clone)] +#[derive(Debug, Clone, PartialEq)] pub enum FileCompressionType { GZIP, ZSTD, diff --git a/src/file_options.rs b/src/file_options.rs index c367a04..ef83d52 100644 --- a/src/file_options.rs +++ b/src/file_options.rs @@ -12,19 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::{path::Path, str::FromStr}; - -use datafusion::datasource::file_format::file_compression_type::FileCompressionType; +use std::path::Path; +use std::str::FromStr; use crate::error::BioBearResult; +use crate::FileCompressionType; mod settable_from_file_options; +pub(crate) use settable_from_file_options::impl_settable_from_file_options; pub(crate) use settable_from_file_options::SettableFromFileOptions; #[derive(Debug, Clone, Default)] pub(crate) struct FileOptions { file_extension: Option, - file_compression_type: Option, + file_compression_type: Option, } impl FileOptions { @@ -32,22 +33,22 @@ impl FileOptions { self.file_extension.as_deref() } - pub fn file_compression_type(&self) -> Option { - self.file_compression_type + pub fn file_compression_type(&self) -> Option { + self.file_compression_type.clone() } pub fn set_from_file_options( - &mut self, + &self, settable: &mut dyn settable_from_file_options::SettableFromFileOptions, ) -> BioBearResult<()> { - if let Some(file_extension) = self.file_extension() { - let file_options = settable.file_extension_mut(); - *file_options = Some(file_extension.to_string()); + if settable.file_extension_mut().is_none() { + let file_extension_option = settable.file_extension_mut(); + *file_extension_option = self.file_extension().map(|ext| ext.to_string()); } if let Some(file_compression_type) = self.file_compression_type() { let file_options = settable.file_compression_type_mut(); - *file_options = Some(crate::FileCompressionType::try_from(file_compression_type)?); + *file_options = Some(file_compression_type.clone()); } Ok(()) @@ -86,6 +87,8 @@ impl From<&str> for FileOptions { #[cfg(test)] mod tests { + use crate::FileCompressionType; + use super::*; #[test] diff --git a/src/file_options/settable_from_file_options.rs b/src/file_options/settable_from_file_options.rs index e057eb7..9f2ae84 100644 --- a/src/file_options/settable_from_file_options.rs +++ b/src/file_options/settable_from_file_options.rs @@ -19,3 +19,19 @@ pub(crate) trait SettableFromFileOptions { fn file_compression_type_mut(&mut self) -> &mut Option; } + +macro_rules! impl_settable_from_file_options { + ($struct_name:ident) => { + impl crate::file_options::SettableFromFileOptions for $struct_name { + fn file_extension_mut(&mut self) -> &mut Option { + &mut self.file_extension + } + + fn file_compression_type_mut(&mut self) -> &mut Option { + &mut self.file_compression_type + } + } + }; +} + +pub(crate) use impl_settable_from_file_options; diff --git a/src/session_context.rs b/src/session_context.rs index a8288ae..443cb7a 100644 --- a/src/session_context.rs +++ b/src/session_context.rs @@ -56,7 +56,10 @@ impl BioBearSessionContext { options: Option, py: Python, ) -> PyResult { - let options = options.unwrap_or_default(); + let file_options = FileOptions::from(file_path); + let mut options = options.unwrap_or_default(); + + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_vcf(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; @@ -71,7 +74,10 @@ impl BioBearSessionContext { options: Option, py: Python, ) -> PyResult { - let options = options.unwrap_or_default(); + let file_options = FileOptions::from(file_path); + let mut options = options.unwrap_or_default(); + + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_hmm_dom_tab(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; @@ -86,7 +92,7 @@ impl BioBearSessionContext { options: Option, py: Python, ) -> PyResult { - let mut file_options = FileOptions::from(file_path); + let file_options = FileOptions::from(file_path); let mut options = options.unwrap_or_default(); file_options.set_from_file_options(&mut options)?; @@ -151,7 +157,8 @@ impl BioBearSessionContext { ) -> PyResult { let file_options = FileOptions::from(file_path); let mut options = options.unwrap_or_default(); - options.update_from_file_options(&file_options)?; + + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_fastq(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; @@ -166,7 +173,10 @@ impl BioBearSessionContext { options: Option, py: Python, ) -> PyResult { - let options = options.unwrap_or_default(); + let file_options = FileOptions::from(file_path); + let mut options = options.unwrap_or_default(); + + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_genbank(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; @@ -196,7 +206,10 @@ impl BioBearSessionContext { options: Option, py: Python, ) -> PyResult { - let options = options.unwrap_or_default(); + let file_options = FileOptions::from(file_path); + let mut options = options.unwrap_or_default(); + + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_mzml(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; @@ -211,7 +224,10 @@ impl BioBearSessionContext { options: Option, py: Python, ) -> PyResult { - let options = options.unwrap_or_default(); + let file_options = FileOptions::from(file_path); + let mut options = options.unwrap_or_default(); + + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_gtf(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; @@ -242,8 +258,9 @@ impl BioBearSessionContext { py: Python, ) -> PyResult { let file_options = FileOptions::from(file_path); + let mut options = options.unwrap_or_default(); - options.update_from_file_options(&file_options)?; + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_fasta(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; @@ -260,7 +277,8 @@ impl BioBearSessionContext { ) -> PyResult { let file_options = FileOptions::from(file_path); let mut options = options.unwrap_or_default(); - options.update_from_file_options(&file_options)?; + + file_options.set_from_file_options(&mut options)?; let result = self.ctx.read_bed(file_path, options.into()); let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;