diff --git a/bin/test.sh b/bin/test.sh
index 483c3e1..6cf58f9 100644
--- a/bin/test.sh
+++ b/bin/test.sh
@@ -24,6 +24,12 @@ function teardown {
 }
 
 # Build the code
+
+# uninstall biobear if it's installed
+if pip show biobear; then
+    pip uninstall -y biobear
+fi
+
 cargo build
 maturin develop
 
diff --git a/src/datasources/fasta.rs b/src/datasources/fasta.rs
index fe25e6e..098d6f4 100644
--- a/src/datasources/fasta.rs
+++ b/src/datasources/fasta.rs
@@ -12,9 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-use std::{path::Path, str::FromStr};
-
-use crate::{error::BioBearResult, file_compression_type::FileCompressionType};
+use crate::{error::BioBearResult, file_compression_type::FileCompressionType, file_options::FileOptions};
 use exon::datasources::fasta::{table_provider::ListingFASTATableOptions, SequenceDataType};
 use pyo3::{pyclass, pymethods};
 
@@ -43,99 +41,6 @@ impl From<FastaSequenceDataType> for SequenceDataType {
     }
 }
 
-#[derive(Debug, Clone, Default)]
-pub struct FASTAReadOptionsBuilder {
-    file_extension: Option<String>,
-    file_compression_type: Option<FileCompressionType>,
-    fasta_sequence_data_type: Option<FastaSequenceDataType>,
-}
-
-impl FASTAReadOptionsBuilder {
-    pub fn new() -> Self {
-        Self {
-            file_extension: None,
-            file_compression_type: None,
-            fasta_sequence_data_type: None,
-        }
-    }
-
-    pub fn merge(mut self, other: FASTAReadOptions) -> Self {
-        if other.file_extension.is_some() {
-            self.file_extension = other.file_extension;
-        }
-
-        if other.file_compression_type.is_some() {
-            self.file_compression_type = other.file_compression_type;
-        }
-
-        if other.fasta_sequence_data_type.is_some() {
-            self.fasta_sequence_data_type = other.fasta_sequence_data_type;
-        }
-
-        self
-    }
-
-    pub fn from_path(file_path: &str) -> Self {
-        let path = Path::new(file_path);
-
-        let extension = if let Some(extension) = path.extension().and_then(|ext| ext.to_str()) {
-            extension
-        } else {
-            return Self::new();
-        };
-
-        if let Ok(file_compression_type) = FileCompressionType::from_str(extension) {
-            // we got a file compression type, so now check the stem and its extension
-            if let Some(stem) = path.file_stem().and_then(|stem| stem.to_str()) {
-                let stem = Path::new(stem);
-                if let Some(file_extension) = stem.extension().and_then(|ext| ext.to_str()) {
-                    return Self::new()
-                        .with_file_extension(file_extension)
-                        .with_file_compression_type(file_compression_type);
-                } else {
-                    return Self::new().with_file_compression_type(file_compression_type);
-                };
-            } else {
-                return Self::new().with_file_compression_type(file_compression_type);
-            };
-        }
-
-        Self {
-            file_extension: Some(extension.to_string()),
-            file_compression_type: None,
-            fasta_sequence_data_type: None,
-        }
-    }
-
-    pub fn with_file_extension(mut self, file_extension: &str) -> Self {
-        self.file_extension = Some(file_extension.to_string());
-        self
-    }
-
-    pub fn with_file_compression_type(
-        mut self,
-        file_compression_type: FileCompressionType,
-    ) -> Self {
-        self.file_compression_type = Some(file_compression_type);
-        self
-    }
-
-    pub fn with_fasta_sequence_data_type(
-        mut self,
-        fasta_sequence_data_type: FastaSequenceDataType,
-    ) -> Self {
-        self.fasta_sequence_data_type = Some(fasta_sequence_data_type);
-        self
-    }
-
-    pub fn build(self) -> FASTAReadOptions {
-        FASTAReadOptions {
-            file_extension: self.file_extension,
-            file_compression_type: self.file_compression_type,
-            fasta_sequence_data_type: self.fasta_sequence_data_type,
-        }
-    }
-}
 
 #[pyclass]
 #[derive(Debug, Clone)]
@@ -202,8 +107,17 @@ impl FASTAReadOptions {
 }
 
 impl FASTAReadOptions {
-    pub fn builder() -> FASTAReadOptionsBuilder {
-        FASTAReadOptionsBuilder::new()
+    pub(crate) fn update_from_file_options(&mut self, file_options: &FileOptions) -> BioBearResult<()> {
+        if let Some(file_extension) = file_options.file_extension() {
+            self.file_extension = Some(file_extension.to_string());
+        }
+
+        if let Some(file_compression_type) = file_options.file_compression_type() {
+            let fct = FileCompressionType::try_from(file_compression_type)?;
+            self.file_compression_type = Some(fct);
+        }
+
+        Ok(())
     }
 }
 
diff --git a/src/file_options.rs b/src/file_options.rs
new file mode 100644
index 0000000..8a3152d
--- /dev/null
+++ b/src/file_options.rs
@@ -0,0 +1,86 @@
+// Copyright 2024 WHERE TRUE Technologies.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+use std::{path::Path, str::FromStr};
+
+use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
+
+#[derive(Debug, Clone, Default)]
+pub(crate) struct FileOptions {
+    file_extension: Option<String>,
+    file_compression_type: Option<FileCompressionType>,
+}
+
+impl FileOptions {
+    pub fn file_extension(&self) -> Option<&str> {
+        self.file_extension.as_deref()
+    }
+
+    pub fn file_compression_type(&self) -> Option<FileCompressionType> {
+        self.file_compression_type
+    }
+
+}
+
+impl From<&str> for FileOptions {
+    fn from(s: &str) -> Self {
+        let path = Path::new(s);
+
+        let extension = match path.extension().and_then(|ext| ext.to_str()) {
+            Some(ext) => ext,
+            None => return Self::default(),
+        };
+
+        if let Ok(file_compression_type) = FileCompressionType::from_str(extension) {
+            if let Some(stem) = path.file_stem().and_then(|stem| stem.to_str()) {
+                let file_extension = Path::new(stem).extension().and_then(|ext| ext.to_str());
+                return Self {
+                    file_extension: file_extension.map(|ext| ext.to_string()),
+                    file_compression_type: Some(file_compression_type),
+                };
+            }
+            return Self {
+                file_extension: None,
+                file_compression_type: Some(file_compression_type),
+            }
+        }
+
+        Self {
+            file_extension: Some(extension.to_string()),
+            file_compression_type: None,
+        }
+
+    }
+}
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_from_str() {
+        let file_options = FileOptions::from("test.csv");
+        assert_eq!(file_options.file_extension(), Some("csv"));
+        assert_eq!(file_options.file_compression_type(), None);
+
+        let file_options = FileOptions::from("test.csv.gz");
+        assert_eq!(file_options.file_extension(), Some("csv"));
+        assert_eq!(file_options.file_compression_type(), Some(FileCompressionType::GZIP));
+
+        let file_options = FileOptions::from("test");
+        assert_eq!(file_options.file_extension, None);
+        assert_eq!(file_options.file_compression_type, None);
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index f937ea9..99d45cd 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -21,6 +21,7 @@ mod exon_reader;
 mod vcf_reader;
 
 mod file_compression_type;
+mod file_options;
 
 pub use file_compression_type::FileCompressionType;
 
diff --git a/src/session_context.rs b/src/session_context.rs
index d361455..2e7f5f0 100644
--- a/src/session_context.rs
+++ b/src/session_context.rs
@@ -19,12 +19,13 @@ use pyo3::prelude::*;
 
 use crate::datasources::bcf::BCFReadOptions;
 use crate::datasources::bigwig::BigWigReadOptions;
-use crate::datasources::fasta::{FASTAReadOptions, FASTAReadOptionsBuilder};
+use crate::datasources::fasta::FASTAReadOptions;
 use crate::datasources::fastq::FASTQReadOptions;
 use crate::datasources::hmm_dom_tab::HMMDomTabReadOptions;
 use crate::datasources::mzml::MzMLReadOptions;
 use crate::error;
 use crate::execution_result::ExecutionResult;
+use crate::file_options::FileOptions;
 use crate::runtime::wait_for_future;
 
 #[pyclass]
@@ -217,13 +218,9 @@ impl BioBearSessionContext {
         options: Option<FASTAReadOptions>,
         py: Python,
     ) -> PyResult<ExecutionResult> {
-        let options = if let Some(options) = options {
-            let fasta_read_options_builder = FASTAReadOptionsBuilder::from_path(file_path);
-
-            fasta_read_options_builder.merge(options).build()
-        } else {
-            FASTAReadOptionsBuilder::from_path(file_path).build()
-        };
+        let file_options = FileOptions::from(file_path);
+        let mut options = options.unwrap_or_default();
+        options.update_from_file_options(&file_options)?;
 
         let result = self.ctx.read_fasta(file_path, options.into());
         let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;