diff --git a/Cargo.toml b/Cargo.toml index d6f4b3f..ced011e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,12 +8,12 @@ crate-type = ["cdylib"] name = "biobear" [dependencies] -arrow = { version = "52.0.0", features = ["pyarrow"] } -datafusion = "39" -exon = { version = "0.26", features = ["default"] } +arrow = { version = "52.1.0", features = ["pyarrow"] } +datafusion = "40" +exon = { version = "0.29", features = ["default"] } pyo3 = "0.21.2" tokio = { version = "1", features = ["rt"] } -noodles = { version = "0.77", features = ["core"] } +noodles = { version = "0.78", features = ["core"] } [profile.release] codegen-units = 1 diff --git a/python/biobear/__init__.py b/python/biobear/__init__.py index 5fb7b7f..ead4759 100644 --- a/python/biobear/__init__.py +++ b/python/biobear/__init__.py @@ -43,6 +43,7 @@ from .biobear import GenBankReadOptions from .biobear import FCSReadOptions from .biobear import CRAMReadOptions +from .biobear import SDFReadOptions from .biobear import connect from .biobear import new_session from .biobear import __runtime @@ -72,6 +73,7 @@ "BCFReadOptions", "VCFReadOptions", "BEDReadOptions", + "SDFReadOptions", "FCSReadOptions", "CRAMReadOptions", "BigWigReadOptions", diff --git a/python/biobear/biobear.pyi b/python/biobear/biobear.pyi index 2a29b0b..bf06cb3 100644 --- a/python/biobear/biobear.pyi +++ b/python/biobear/biobear.pyi @@ -116,7 +116,7 @@ class VCFReadOptions: file_compression_type: Optional[FileCompressionType] = None, parse_info: bool = False, parse_formats: bool = False, - partition_cols: list[str | None] = None, + partition_cols: list[str] | None = None, ) -> None: ... class BCFReadOptions: @@ -141,6 +141,16 @@ class BAMReadOptions: region: Optional[str] = None, ) -> None: ... +class SDFReadOptions: + """Options for reading SDF data.""" + def __init__( + self, + /, + file_compression_type: Optional[FileCompressionType] = None, + file_extension: Optional[str] = None, + partition_cols: list[str] | None = None, + ) -> None: ... + class BEDReadOptions: """Options for reading BED data.""" def __init__( @@ -187,6 +197,10 @@ class BioBearSessionContext: self, file_path: str, /, options: Optional[FASTQReadOptions] = None ) -> ExecutionResult: """Reads one or more FASTQ files and returns an ExecutionResult.""" + def read_sdf_file( + self, file_path: str, /, options: Optional[SDFReadOptions] = None + ) -> ExecutionResult: + """Reads one or more SDF files and returns an ExecutionResult.""" def read_fasta_file( self, file_path: str, /, options: Optional[FASTAReadOptions] = None ) -> ExecutionResult: diff --git a/python/tests/data/tox_benchmark_N6512.sdf b/python/tests/data/tox_benchmark_N6512.sdf new file mode 100644 index 0000000..05dc16f --- /dev/null +++ b/python/tests/data/tox_benchmark_N6512.sdf @@ -0,0 +1,339 @@ + + SciTegic02060916132D + + 50 60 0 0 0 0 999 V2000 + -5.2740 4.8598 0.0000 O 0 0 + -5.4300 3.6700 0.0000 C 0 0 + -6.8500 3.0700 0.0000 C 0 0 + -8.0800 4.0100 0.0000 C 0 0 + -9.5100 3.4300 0.0000 C 0 0 + -9.6700 1.7700 0.0000 C 0 0 + -8.3900 0.8900 0.0000 C 0 0 + -6.9800 1.4800 0.0000 C 0 0 + -5.7800 0.6200 0.0000 C 0 0 + -5.9171 -0.5721 0.0000 O 0 0 + -4.3800 1.2000 0.0000 C 0 0 + -4.2200 2.7300 0.0000 C 0 0 + -2.8300 3.3500 0.0000 C 0 0 + -1.5900 2.4100 0.0000 C 0 0 + -1.7800 0.8500 0.0000 C 0 0 + -3.1400 0.2700 0.0000 C 0 0 + -2.9600 -1.2600 0.0000 N 0 0 + -1.5000 -1.5300 0.0000 C 0 0 + -0.7400 -2.8600 0.0000 C 0 0 + -1.4800 -4.1900 0.0000 C 0 0 + -2.6797 -4.2171 0.0000 O 0 0 + -0.6900 -5.4700 0.0000 C 0 0 + -1.4800 -6.8500 0.0000 C 0 0 + -0.7400 -8.1800 0.0000 C 0 0 + 0.7900 -8.1800 0.0000 C 0 0 + 1.5300 -6.8500 0.0000 C 0 0 + 0.7900 -5.4700 0.0000 C 0 0 + 1.5300 -4.1500 0.0000 C 0 0 + 2.7300 -4.1519 0.0000 O 0 0 + 0.7900 -2.8200 0.0000 C 0 0 + 1.5300 -1.4800 0.0000 C 0 0 + 3.0000 -1.1900 0.0000 N 0 0 + 3.1500 0.3200 0.0000 C 0 0 + 4.3800 1.2600 0.0000 C 0 0 + 5.7900 0.6800 0.0000 C 0 0 + 5.9393 -0.5107 0.0000 O 0 0 + 7.0400 1.6100 0.0000 C 0 0 + 8.4400 1.0700 0.0000 C 0 0 + 9.6800 2.0400 0.0000 C 0 0 + 9.4000 3.5400 0.0000 C 0 0 + 8.0000 4.0900 0.0000 C 0 0 + 6.7800 3.1400 0.0000 C 0 0 + 5.3800 3.7200 0.0000 C 0 0 + 5.2120 4.9082 0.0000 O 0 0 + 4.1900 2.7700 0.0000 C 0 0 + 2.7300 3.3900 0.0000 C 0 0 + 1.5100 2.4400 0.0000 C 0 0 + 1.7700 0.8500 0.0000 C 0 0 + 0.7400 -0.2700 0.0000 C 0 0 + -0.7700 -0.2700 0.0000 C 0 0 + 1 2 2 0 + 2 3 1 0 + 3 4 2 0 + 4 5 1 0 + 5 6 2 0 + 6 7 1 0 + 7 8 2 0 + 8 3 1 0 + 8 9 1 0 + 9 10 2 0 + 9 11 1 0 + 11 12 2 0 + 12 2 1 0 + 12 13 1 0 + 13 14 2 0 + 14 15 1 0 + 15 16 2 0 + 16 11 1 0 + 16 17 1 0 + 17 18 1 0 + 18 19 2 0 + 19 20 1 0 + 20 21 2 0 + 20 22 1 0 + 22 23 2 0 + 23 24 1 0 + 24 25 2 0 + 25 26 1 0 + 26 27 2 0 + 27 22 1 0 + 27 28 1 0 + 28 29 2 0 + 28 30 1 0 + 30 19 1 0 + 30 31 2 0 + 31 32 1 0 + 32 33 1 0 + 33 34 2 0 + 34 35 1 0 + 35 36 2 0 + 35 37 1 0 + 37 38 2 0 + 38 39 1 0 + 39 40 2 0 + 40 41 1 0 + 41 42 2 0 + 42 37 1 0 + 42 43 1 0 + 43 44 2 0 + 43 45 1 0 + 45 34 1 0 + 45 46 2 0 + 46 47 1 0 + 47 48 2 0 + 48 33 1 0 + 48 49 1 0 + 49 31 1 0 + 49 50 2 0 + 50 15 1 0 + 50 18 1 0 +M END +> +O=C1c2ccccc2C(=O)c3c1ccc4c3[nH]c5c6C(=O)c7ccccc7C(=O)c6c8[nH]c9c%10C(=O)c%11ccccc%11C(=O)c%10ccc9c8c45 + +> +2475-33-4 + +> +VITIC + +> +0 + +> +. + +> +JUDSON, PN, COOKE, PA, DOERRER, NG, GREENE, N, HANZLIK, RP, HARDY, C, HARTMANN, A, HINCHLIFFE, D, HOLDER, J, MUELLER, L, STEGER-HARTMANN, T, ROTHFUSS, A, SMITH, M, THOMAS, K, VESSEY, JD AND ZEIGER E. +TOWARDS THE CREATION OF AN INTERNATIONAL TOXICOLOGY INFORMATION CENTRE. TOXICOLOGY 213(1-2):117-28, 2005 + +> +0 + +> +0 + +> +0 + +> +1 + +> +646.60212 + +> +CV3 + +$$$$ + + SciTegic02060916132D + + 11 10 0 0 0 0 999 V2000 + 0.2606 0.1503 0.0000 N 0 0 + 1.3000 0.7500 0.0000 N 0 0 + 2.6000 0.0000 0.0000 C 0 0 + 2.6000 -1.2000 0.0000 O 0 0 + 3.9000 0.7500 0.0000 C 0 0 + 5.2000 0.0000 0.0000 N 0 0 + 6.5000 0.7500 0.0000 C 0 0 + 6.5000 1.9500 0.0000 O 0 0 + 7.7999 0.0000 0.0000 C 0 0 + 9.0999 0.7500 0.0000 N 0 0 + 10.1394 1.3497 0.0000 N 0 0 + 1 2 1 0 + 2 3 1 0 + 3 4 2 0 + 3 5 1 0 + 5 6 1 0 + 6 7 1 0 + 7 8 2 0 + 7 9 1 0 + 9 10 2 3 + 10 11 3 0 +M END +> +NNC(=O)CNC(=O)C=N#N + +> +820-75-7 + +> +CCRIS + +> +1 + +> +. + +> +MCCANN,J, CHOI,E, YAMASAKI,E AND AMES,BN, DETECTION OF CARCINOGENS ASMUTAGENS IN THE SALMONELLA/MICROSOME TEST: ASSAY OF 300 CHEMICALS, PROC. NATL.ACAD. SCI. USA 72(12):5135-5139, 1975 + +> +0 + +> +0 + +> +0 + +> +0 + +> +157.13067 + +> +CV3 + +$$$$ + + SciTegic02060916132D + + 10 10 0 0 0 0 999 V2000 + 2.3383 -1.3500 0.0000 O 0 0 + 1.2990 -0.7500 0.0000 C 0 0 + 1.2990 0.7500 0.0000 N 0 0 + 0.0000 1.5000 0.0000 C 0 0 + 0.0000 2.7000 0.0000 O 0 0 + -1.2990 0.7500 0.0000 C 0 0 + -2.5988 1.5004 0.0000 N 0 0 + -3.6380 2.1004 0.0000 N 0 0 + -1.2990 -0.7500 0.0000 C 0 0 + 0.0000 -1.5000 0.0000 N 0 0 + 1 2 2 0 + 2 3 1 0 + 3 4 1 0 + 4 5 2 0 + 4 6 1 0 + 6 7 2 3 + 7 8 3 0 + 6 9 1 0 + 9 10 2 3 + 10 2 1 0 +M END +> +O=C1NC(=O)C(=N#N)C=N1 + +> +2435-76-9 + +> +CCRIS + +> +1 + +> +. + +> +HIRAMOTO,K, KATO,T AND KIKUGAWA,K, MECHANISMS OF THE DNA BREAKINGACTIVITY OF MUTAGENIC 5-DIAZOURACIL, MUTAT. RES. 306(2):153-163, 1994 + +> +0 + +> +1 + +> +0 + +> +0 + +> +138.08428 + +> +CV1 + +$$$$ + + SciTegic02060916132D + + 10 9 0 0 0 0 999 V2000 + 1.3000 1.9500 0.0000 N 0 0 + 1.3000 0.7500 0.0000 C 0 0 + 0.2606 0.1503 0.0000 O 0 0 + 2.6000 0.0000 0.0000 C 0 0 + 3.9000 0.7500 0.0000 N 0 0 + 5.2000 0.0000 0.0000 C 0 0 + 5.2000 -1.2000 0.0000 O 0 0 + 6.5000 0.7500 0.0000 C 0 0 + 7.7999 0.0000 0.0000 N 0 0 + 8.8394 -0.5997 0.0000 N 0 0 + 1 2 1 0 + 2 3 2 0 + 2 4 1 0 + 4 5 1 0 + 5 6 1 0 + 6 7 2 0 + 6 8 1 0 + 8 9 2 3 + 9 10 3 0 +M END +> +NC(=O)CNC(=O)C=N#N + +> +817-99-2 + +> +CCRIS + +> +1 + +> +. + +> +MCCANN,J, CHOI,E, YAMASAKI,E AND AMES,BN, DETECTION OF CARCINOGENS ASMUTAGENS IN THE SALMONELLA/MICROSOME TEST: ASSAY OF 300 CHEMICALS, PROC. NATL.ACAD. SCI. USA 72(12):5135-5139, 1975 + +> +0 + +> +0 + +> +0 + +> +0 + +> +142.11604 + +> +CV2 + +$$$$ diff --git a/python/tests/data/tox_benchmark_N6512.sdf.gz b/python/tests/data/tox_benchmark_N6512.sdf.gz new file mode 100644 index 0000000..109102a Binary files /dev/null and b/python/tests/data/tox_benchmark_N6512.sdf.gz differ diff --git a/python/tests/test_session.py b/python/tests/test_session.py index be0f985..6cd9007 100644 --- a/python/tests/test_session.py +++ b/python/tests/test_session.py @@ -744,6 +744,24 @@ def test_bed_four(): assert result.to_polars().shape == (10, 4) +def test_sdf_file(): + session = new_session() + + sdf_file = DATA / "tox_benchmark_N6512.sdf" + result = session.read_sdf_file(sdf_file.as_posix()) + + assert len(result.to_polars()) == 4 + + +def test_sdf_gzip_file(): + session = new_session() + + sdf_file = DATA / "tox_benchmark_N6512.sdf.gz" + result = session.read_sdf_file(sdf_file.as_posix()) + + assert len(result.to_polars()) == 4 + + def test_bed_long_name(): session = new_session() diff --git a/src/datasources/fastq.rs b/src/datasources/fastq.rs index c4eba36..bf4292b 100644 --- a/src/datasources/fastq.rs +++ b/src/datasources/fastq.rs @@ -20,8 +20,6 @@ use pyo3::{pyclass, pymethods}; const DEFAULT_FASTQ_FILE_EXTENSION: &str = "fastq"; -#[pyclass] -#[derive(Debug, Clone)] /// Options for reading FASTQ files. /// /// When using from Python, the arguments are optional, but if passed, must be passed as kwargs. @@ -46,7 +44,8 @@ const DEFAULT_FASTQ_FILE_EXTENSION: &str = "fastq"; /// let options = FASTQReadOptions::default(); /// assert_eq!(options.file_extension, "fastq"); /// ``` -#[derive(Default)] +#[pyclass] +#[derive(Debug, Clone, Default)] pub struct FASTQReadOptions { file_extension: Option, file_compression_type: Option, diff --git a/src/datasources/hmm_dom_tab.rs b/src/datasources/hmm_dom_tab.rs index adffd13..49ecf84 100644 --- a/src/datasources/hmm_dom_tab.rs +++ b/src/datasources/hmm_dom_tab.rs @@ -15,26 +15,54 @@ use exon::datasources::hmmdomtab::table_provider::ListingHMMDomTabTableOptions; use pyo3::{pyclass, pymethods}; -use crate::FileCompressionType; +use crate::{file_options::SettableFromFileOptions, FileCompressionType}; + +const DEFAULT_HMM_FILE_EXTENSION: &str = "hmmdomtab"; #[pyclass] #[derive(Debug, Clone, Default)] pub struct HMMDomTabReadOptions { - file_compression_type: FileCompressionType, + file_compression_type: Option, + file_extension: Option, +} + +impl SettableFromFileOptions for HMMDomTabReadOptions { + fn file_extension_mut(&mut self) -> &mut Option { + &mut self.file_extension + } + + fn file_compression_type_mut(&mut self) -> &mut Option { + &mut self.file_compression_type + } } #[pymethods] impl HMMDomTabReadOptions { #[new] - fn new(file_compression_type: Option) -> Self { + fn new( + file_extension: Option, + file_compression_type: Option, + ) -> Self { Self { - file_compression_type: file_compression_type.unwrap_or_default(), + file_extension, + file_compression_type, } } } +impl HMMDomTabReadOptions {} + impl From for ListingHMMDomTabTableOptions { fn from(options: HMMDomTabReadOptions) -> Self { - ListingHMMDomTabTableOptions::new(options.file_compression_type.into()) + let file_compression_type = options + .file_compression_type + .unwrap_or(FileCompressionType::UNCOMPRESSED); + + let file_extension = options + .file_extension + .unwrap_or(DEFAULT_HMM_FILE_EXTENSION.to_string()); + + ListingHMMDomTabTableOptions::new(file_compression_type.into()) + .with_file_extension(file_extension) } } diff --git a/src/datasources/mod.rs b/src/datasources/mod.rs index c562d4f..864c109 100644 --- a/src/datasources/mod.rs +++ b/src/datasources/mod.rs @@ -31,6 +31,7 @@ pub mod gtf; pub mod hmm_dom_tab; pub mod mzml; pub mod sam; +pub mod sdf; pub mod vcf; pub(crate) fn parse_region(region: Option) -> PyResult> { diff --git a/src/datasources/sdf.rs b/src/datasources/sdf.rs new file mode 100644 index 0000000..2a5e73b --- /dev/null +++ b/src/datasources/sdf.rs @@ -0,0 +1,62 @@ +// Copyright 2024 WHERE TRUE Technologies. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use exon::datasources::sdf::ListingSDFTableOptions; +use pyo3::{pyclass, pymethods}; + +use crate::{file_options::SettableFromFileOptions, FileCompressionType}; + +#[pyclass] +#[derive(Debug, Clone, Default)] +/// Options for reading SDF files. +pub struct SDFReadOptions { + file_compression_type: Option, + file_extension: Option, +} + +impl SettableFromFileOptions for SDFReadOptions { + fn file_extension_mut(&mut self) -> &mut Option { + &mut self.file_extension + } + + fn file_compression_type_mut(&mut self) -> &mut Option { + &mut self.file_compression_type + } +} + +#[pymethods] +impl SDFReadOptions { + #[new] + #[pyo3(signature = (/, file_compression_type=None))] + /// Create a new SDFReadOptions instance. + pub fn new(file_compression_type: Option) -> Self { + Self { + file_compression_type, + file_extension: Some("sdf".to_string()), + } + } +} + +impl From for ListingSDFTableOptions { + fn from(options: SDFReadOptions) -> Self { + let mut listing_options = ListingSDFTableOptions::default(); + + if let Some(file_compression_type) = options.file_compression_type { + listing_options = + listing_options.with_file_compression_type(file_compression_type.into()); + } + + listing_options + } +} diff --git a/src/file_options.rs b/src/file_options.rs index 2b63515..c367a04 100644 --- a/src/file_options.rs +++ b/src/file_options.rs @@ -16,6 +16,11 @@ use std::{path::Path, str::FromStr}; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; +use crate::error::BioBearResult; + +mod settable_from_file_options; +pub(crate) use settable_from_file_options::SettableFromFileOptions; + #[derive(Debug, Clone, Default)] pub(crate) struct FileOptions { file_extension: Option, @@ -30,6 +35,23 @@ impl FileOptions { pub fn file_compression_type(&self) -> Option { self.file_compression_type } + + pub fn set_from_file_options( + &mut self, + settable: &mut dyn settable_from_file_options::SettableFromFileOptions, + ) -> BioBearResult<()> { + if let Some(file_extension) = self.file_extension() { + let file_options = settable.file_extension_mut(); + *file_options = Some(file_extension.to_string()); + } + + if let Some(file_compression_type) = self.file_compression_type() { + let file_options = settable.file_compression_type_mut(); + *file_options = Some(crate::FileCompressionType::try_from(file_compression_type)?); + } + + Ok(()) + } } impl From<&str> for FileOptions { diff --git a/src/file_options/settable_from_file_options.rs b/src/file_options/settable_from_file_options.rs new file mode 100644 index 0000000..e057eb7 --- /dev/null +++ b/src/file_options/settable_from_file_options.rs @@ -0,0 +1,21 @@ +// Copyright 2024 WHERE TRUE Technologies. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::FileCompressionType; + +pub(crate) trait SettableFromFileOptions { + fn file_extension_mut(&mut self) -> &mut Option; + + fn file_compression_type_mut(&mut self) -> &mut Option; +} diff --git a/src/lib.rs b/src/lib.rs index 99d45cd..727a234 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -70,6 +70,7 @@ fn biobear(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + m.add_class::()?; m.add_function(wrap_pyfunction!(session_context::connect, m)?)?; m.add_function(wrap_pyfunction!(session_context::new_session, m)?)?; diff --git a/src/session_context.rs b/src/session_context.rs index fd1fe5c..a8288ae 100644 --- a/src/session_context.rs +++ b/src/session_context.rs @@ -27,6 +27,7 @@ use crate::error; use crate::execution_result::ExecutionResult; use crate::file_options::FileOptions; use crate::runtime::wait_for_future; +use pyo3::{pyclass, pymethods}; #[pyclass] pub struct BioBearSessionContext { @@ -78,6 +79,24 @@ impl BioBearSessionContext { Ok(ExecutionResult::new(df)) } + /// Read an SDF file from the given path. + fn read_sdf_file( + &mut self, + file_path: &str, + options: Option, + py: Python, + ) -> PyResult { + let mut file_options = FileOptions::from(file_path); + let mut options = options.unwrap_or_default(); + + file_options.set_from_file_options(&mut options)?; + + let result = self.ctx.read_sdf(file_path, options.into()); + let df = wait_for_future(py, result).map_err(error::BioBearError::from)?; + + Ok(ExecutionResult::new(df)) + } + /// Read a bigwig file from the given path. fn read_bigwig_file( &mut self,