Skip to content

Commit

Permalink
feat: add cram and fcs (#127)
Browse files Browse the repository at this point in the history
* feat: add cram and fcs
* fix: bad name
  • Loading branch information
tshauck authored Apr 23, 2024
1 parent 22fc7c7 commit 50ac25e
Show file tree
Hide file tree
Showing 20 changed files with 21,005 additions and 15 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ name = "biobear"
[dependencies]
arrow = { version = "51.0.0", features = ["pyarrow"] }
datafusion = "37"
exon = { version = "0.18.0", features = ["default"] }
exon = { version = "0.19.1", features = ["default"] }
pyo3 = "0.20"
tokio = { version = "1", features = ["rt"] }
noodles = { version = "0.70", features = ["core"] }
Expand Down
4 changes: 4 additions & 0 deletions python/biobear/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@
from .biobear import HMMDomTabReadOptions
from .biobear import MzMLReadOptions
from .biobear import GenBankReadOptions
from .biobear import FCSReadOptions
from .biobear import CRAMReadOptions
from .biobear import connect
from .biobear import new_session
from .biobear import __runtime
Expand Down Expand Up @@ -68,6 +70,8 @@
"BCFReadOptions",
"VCFReadOptions",
"BEDReadOptions",
"FCSReadOptions",
"CRAMReadOptions",
"BigWigReadOptions",
"SAMReadOptions",
"BAMReadOptions",
Expand Down
23 changes: 23 additions & 0 deletions python/biobear/biobear.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,21 @@ class FileCompressionType(enum.Enum):
BGZIP = 1
NONE = 2

# Options object accepted by `BioBearSessionContext.read_cram_file`.
# Every argument is optional and defaults to None.
class CRAMReadOptions:
def __init__(
self,
/,
# Region string limiting which records are read; it is parsed by the
# Rust constructor, which raises if the string is not a valid region.
region: Optional[str] = None,
# FASTA reference handed to the CRAM reader (the test suite passes a
# path to a `.fa` file).
fasta_reference: Optional[str] = None,
) -> None: ...

# Options object accepted by `BioBearSessionContext.read_fcs_file`.
class FCSReadOptions:
def __init__(
self,
/,
# Compression of the input file. When None, the Rust constructor falls
# back to the default `FileCompressionType`.
file_compression_type: Optional[FileCompressionType] = None,
) -> None: ...

class HMMDomTabReadOptions:
def __init__(
self,
Expand Down Expand Up @@ -90,6 +105,8 @@ class BAMReadOptions:
# Read options for BED input.
class BEDReadOptions:
def __init__(
self,
/,
# Optional compression of the input file; defaults to None.
# NOTE(review): how None is resolved is decided by the Rust side, which is
# not visible here - confirm.
file_compression_type: Optional[FileCompressionType] = None,
) -> None: ...

class BigWigReadOptions:
Expand Down Expand Up @@ -154,6 +171,12 @@ class BioBearSessionContext:
# Stub: takes the path of a GenBank file plus read options and returns an
# ExecutionResult. `file_path` is positional-only.
# NOTE(review): `options` is declared without a default here; confirm whether
# the Rust binding defaults it to None, as it does for `read_cram_file`.
def read_genbank_file(
self, file_path: str, /, options: Optional[GenBankReadOptions]
) -> ExecutionResult: ...
# Stub: reads the CRAM file at `file_path` and returns an ExecutionResult.
# `file_path` is positional-only. `options` may be omitted: the Rust binding
# is declared with `options=None` and falls back to default CRAMReadOptions.
def read_cram_file(
    self, file_path: str, /, options: Optional[CRAMReadOptions] = None
) -> ExecutionResult: ...
# Stub: takes the path of an FCS file plus read options and returns an
# ExecutionResult. `file_path` is positional-only.
# NOTE(review): the Rust implementation of this method is not visible here;
# confirm that it exists and whether `options` should default to None.
def read_fcs_file(
self, file_path: str, /, options: Optional[FCSReadOptions]
) -> ExecutionResult: ...
# Stub: takes a query string and returns an ExecutionResult.
def sql(self, query: str) -> ExecutionResult: ...
# Stub: takes a query string and returns nothing.
def execute(self, query: str) -> None: ...

Expand Down
Binary file added python/tests/data/cram/0500_mapped.cram
Binary file not shown.
Binary file not shown.
Binary file not shown.
20,803 changes: 20,803 additions & 0 deletions python/tests/data/cram/ce.fa

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions python/tests/data/cram/ce.fa.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
CHROMOSOME_I 1009800 14 50 51
CHROMOSOME_II 5000 1030025 50 51
CHROMOSOME_III 5000 1035141 50 51
CHROMOSOME_IV 5000 1040256 50 51
CHROMOSOME_V 5000 1045370 50 51
CHROMOSOME_X 5000 1050484 50 51
CHROMOSOME_MtDNA 5000 1055602 50 51
Binary file added python/tests/data/cram/test_input_1_a.cram
Binary file not shown.
18 changes: 18 additions & 0 deletions python/tests/data/two-cram/rand1k.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
>rand1k
TCCTAATTCTGGGTAACCGCCGCCTGAAGCCAAAAAATAAGCCGGAGCCAAGGGGGAGTC
ACACTGCTCCACGAACGCCTCTCATACAGCTTCGTCTTACAGGTGGAGATCATTGTCCCG
GAGAGTCATGTGCCTTAGTTAAAAAGGTTTACTGCGCTCGGGGTCGAGTGCGGGAACTTC
TCGGGTGGCTACGTACCGGGGCCTACCTTGCTATCTTTGACAATCAGCGTTCTGGATTGT
CAGGCTCACTTCCGTAGCAGTTGCTGGAGAATATGGACATATCAGCTTTGACACACTGGG
TTAAGGCGTAGGGTAGAGACGGAGTCCCTTCGCTGCCAATGTGGTGGTTTGGGACGAGTA
TCATGTTGGTGCCCCAAGTTAACTTACTCCGCCCATGTCGTGCGATTACGCGAGAGTAGT
AGATCGCTACGAGTATGTCCTCGGTGATCTAGTTAACTACTGTTACTGATGTCCGTTGCT
CCACAGGTATACTCGGACACAATTCACGGGCTCCTCAAGCATACTAAAGAAGTCACGAGT
GACGTCGGCGTAACCTCACATTAGTGGAGGAACCCGTGTGGAACATCATTCTAACGACAC
TGTCGATCCCGGATGGATATGGTAGTCTTGATTATCCAGAGTCTTAGAGACATGGTAAGT
TAGGAGCGCAGGACCATCAACTCTACTTTCCGGCAAATGTTAAGGGGTTTTGCTGACCAC
CCGCATGCTTACAGTCCCGTTTCGCTAAGGTCTTCCTCGCTGCCTCTAGTTTTAGCGGAC
GTTCCTTTCTCAACTAGTCTATTTGTTCATACTCATTTGGCACAGGTCTGTGTACGTCTA
TCATGCGGACTAAATTACCCACAGAATGTCACAGGACAACATAGTGTTCATCATCCCTGT
GGGATAACCGGTTACCTCGGATGAGGAGTATGAACTATATCTTAGCGTAGACCGATGTAT
GGAAAGGCGCAGCCTCTGGCCGCCCACTATCGGAAATCGT
1 change: 1 addition & 0 deletions python/tests/data/two-cram/rand1k.fa.fai
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
rand1k 1000 9 60 61
Binary file added python/tests/data/two-cram/twolib.sorted.cram
Binary file not shown.
Binary file added python/tests/data/two-cram/twolib.sorted.cram.crai
Binary file not shown.
29 changes: 29 additions & 0 deletions python/tests/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
VCFReadOptions,
GTFReadOptions,
MzMLReadOptions,
CRAMReadOptions,
new_session,
)

Expand Down Expand Up @@ -499,3 +500,31 @@ def test_genbank_reader():
df = result.to_polars()

assert len(df) == 1


@pytest.mark.skipif(
    not importlib.util.find_spec("polars"), reason="polars not installed"
)
def test_cram_reader():
    """Reading a CRAM file with default options yields the expected row count."""
    cram_path = DATA / "cram" / "test_input_1_a.cram"

    frame = new_session().read_cram_file(cram_path.as_posix()).to_polars()

    assert len(frame) == 15


@pytest.mark.skipif(
    not importlib.util.find_spec("polars"), reason="polars not installed"
)
def test_cram_reader_with_region():
    """Reading a CRAM file with a region filter and an explicit FASTA reference."""
    session = new_session()

    two_cram_dir = DATA / "two-cram"
    options = CRAMReadOptions(
        region="1",
        fasta_reference=(two_cram_dir / "rand1k.fa").as_posix(),
    )

    cram_path = (two_cram_dir / "twolib.sorted.cram").as_posix()
    frame = session.read_cram_file(cram_path, options=options).to_polars()

    # NOTE(review): the reference shipped with this fixture names its only
    # sequence "rand1k", so region "1" presumably matches nothing and that is
    # why zero rows are expected - confirm this is the intended coverage.
    assert len(frame) == 0
14 changes: 3 additions & 11 deletions src/datasources/bam.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::str::FromStr;

use exon::datasources::bam::table_provider::ListingBAMTableOptions;
use noodles::core::Region;
use pyo3::{pyclass, pymethods, PyResult};

use super::parse_region;

#[pyclass]
#[derive(Debug, Clone, Default)]
pub struct BAMReadOptions {
Expand All @@ -28,15 +28,7 @@ pub struct BAMReadOptions {
impl BAMReadOptions {
#[new]
pub fn try_new(region: Option<String>) -> PyResult<Self> {
let region = region
.map(|r| Region::from_str(&r))
.transpose()
.map_err(|e| {
crate::error::BioBearError::ParserError(format!(
"Couldn\'t parse region error {}",
e
))
})?;
let region = parse_region(region)?;

Ok(Self { region })
}
Expand Down
52 changes: 52 additions & 0 deletions src/datasources/cram.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
// Copyright 2024 WHERE TRUE Technologies.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use exon::datasources::cram;
use noodles::core::Region;
use pyo3::{pyclass, pymethods, PyResult};

use super::parse_region;

#[pyclass]
#[derive(Debug, Clone, Default)]
pub struct CRAMReadOptions {
region: Option<Region>,
fasta_reference: Option<String>,
}

#[pymethods]
impl CRAMReadOptions {
#[new]
pub fn try_new(region: Option<String>, fasta_reference: Option<String>) -> PyResult<Self> {
let region = parse_region(region)?;

Ok(Self {
region,
fasta_reference,
})
}
}

impl From<CRAMReadOptions> for cram::table_provider::ListingCRAMTableOptions {
fn from(options: CRAMReadOptions) -> Self {
let mut t = cram::table_provider::ListingCRAMTableOptions::default()
.with_fasta_reference(options.fasta_reference);

if let Some(region) = options.region {
t = t.with_region(Some(region)).with_indexed(true);
}

t
}
}
42 changes: 42 additions & 0 deletions src/datasources/fcs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// Copyright 2024 WHERE TRUE Technologies.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use exon::datasources::fcs::table_provider::ListingFCSTableOptions;
use pyo3::{pyclass, pymethods};

use crate::FileCompressionType;

#[pyclass]
#[derive(Debug, Clone, Default)]
pub struct FCSReadOptions {
// File compression type
file_compression_type: FileCompressionType,
}

#[pymethods]
impl FCSReadOptions {
#[new]
pub fn new(file_compression_type: Option<FileCompressionType>) -> Self {
Self {
file_compression_type: file_compression_type.unwrap_or_default(),
}
}
}

impl From<FCSReadOptions> for ListingFCSTableOptions {
fn from(options: FCSReadOptions) -> Self {
ListingFCSTableOptions::default()
.with_file_compression_type(options.file_compression_type.into())
}
}
7 changes: 4 additions & 3 deletions src/datasources/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,16 @@
use std::str::FromStr;

use noodles::core::Region;

use crate::error::BioBearResult;
use pyo3::PyResult;

pub mod bam;
pub mod bcf;
pub mod bed;
pub mod bigwig;
pub mod cram;
pub mod fasta;
pub mod fastq;
pub mod fcs;
pub mod genbank;
pub mod gff;
pub mod gtf;
Expand All @@ -32,7 +33,7 @@ pub mod mzml;
pub mod sam;
pub mod vcf;

pub(crate) fn parse_region(region: Option<String>) -> BioBearResult<Option<noodles::core::Region>> {
pub(crate) fn parse_region(region: Option<String>) -> PyResult<Option<noodles::core::Region>> {
let region = region
.map(|r| Region::from_str(&r))
.transpose()
Expand Down
2 changes: 2 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ fn biobear(_py: Python, m: &PyModule) -> PyResult<()> {
m.add_class::<datasources::mzml::MzMLReadOptions>()?;
m.add_class::<datasources::hmm_dom_tab::HMMDomTabReadOptions>()?;
m.add_class::<datasources::genbank::GenBankReadOptions>()?;
m.add_class::<datasources::cram::CRAMReadOptions>()?;
m.add_class::<datasources::fcs::FCSReadOptions>()?;

m.add_function(wrap_pyfunction!(session_context::connect, m)?)?;
m.add_function(wrap_pyfunction!(session_context::new_session, m)?)?;
Expand Down
16 changes: 16 additions & 0 deletions src/session_context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,22 @@ impl BioBearSessionContext {
Ok(PyExecutionResult::new(df))
}

/// Read a CRAM file from the given path.
#[pyo3(signature = (file_path, /, options=None))]
fn read_cram_file(
&mut self,
file_path: &str,
options: Option<crate::datasources::cram::CRAMReadOptions>,
py: Python,
) -> PyResult<PyExecutionResult> {
let options = options.unwrap_or_default();

let result = self.ctx.read_cram(file_path, options.into());
let df = wait_for_future(py, result).map_err(error::BioBearError::from)?;

Ok(PyExecutionResult::new(df))
}

/// Read a mzml file from the given path.
#[pyo3(signature = (file_path, /, options=None))]
fn read_mzml_file(
Expand Down

0 comments on commit 50ac25e

Please sign in to comment.