Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add fastq/fasta reader on a session #105

Merged
merged 5 commits into from
Mar 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ name = "biobear"
[dependencies]
arrow = { version = "50.0.0", features = ["pyarrow"] }
datafusion = "36"
exon = { version = "0.10.0", features = ["all"] }
exon = { version = "0.11.1", features = ["all"] }
pyo3 = "0.20"
tokio = { version = "1", features = ["rt"] }

Expand Down
2 changes: 1 addition & 1 deletion bin/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,4 @@ aws --endpoint-url=http://localhost:4566 s3 mb s3://parquet-bucket
# Make the bucket public.
aws --endpoint-url=http://localhost:4566 s3api put-bucket-acl --bucket parquet-bucket --acl public-read

pytest -v
pytest -v -s
9 changes: 8 additions & 1 deletion python/biobear/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,16 @@
from biobear.mzml_reader import MzMLReader
from biobear.genbank_reader import GenbankReader
from biobear.bcf_reader import BCFReader, BCFIndexedReader
from biobear.session import connect

from biobear import compression
from biobear.compression import Compression

from .biobear import __runtime
from .biobear import connect

from .biobear import FileCompressionType
from .biobear import FASTQReadOptions
from .biobear import FASTAReadOptions

__version__ = "0.15.3"

Expand All @@ -47,6 +51,9 @@
"MzMLReader",
"compression",
"Compression",
"FileCompressionType",
"FASTQReadOptions",
"FASTAReadOptions",
"__version__",
"connect",
"__runtime",
Expand Down
17 changes: 0 additions & 17 deletions python/biobear/session.py

This file was deleted.

4 changes: 4 additions & 0 deletions python/tests/data/test.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
>a description
ATCG
>b description2
ATCG
Binary file added python/tests/data/test.fa.gz
Binary file not shown.
8 changes: 8 additions & 0 deletions python/tests/data/test.fq
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
@SEQ_ID
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+This is a description
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
@SEQ_ID2
GATTTGGGGTTCAAAGCAGTATCGATCAAATAGTAAATCCATTTGTTCAACTCACAGTTT
+
!''*((((***+))%%%++)(%%%%).1***-+*''))**55CCF>>>>>>CCCCCCC65
Binary file added python/tests/data/test.fq.gz
Binary file not shown.
72 changes: 71 additions & 1 deletion python/tests/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

import pytest

from biobear import connect
from biobear import connect, FASTQReadOptions, FASTAReadOptions, FileCompressionType

DATA = Path(__file__).parent / "data"

Expand All @@ -37,6 +37,76 @@ def test_connect_and_to_arrow():
assert len(arrow_table) == 2


@pytest.mark.skipif(
    importlib.util.find_spec("polars") is None, reason="polars not installed"
)
def test_read_fastq():
    """Read the gzipped and the plain FASTQ fixtures; each must yield two rows."""
    session = connect()

    # (fixture file name, FASTQReadOptions kwargs), handled in this order.
    fixtures = (
        (
            "test.fq.gz",
            {"file_extension": "fq", "file_compression_type": FileCompressionType.GZIP},
        ),
        ("test.fq", {"file_extension": "fq"}),
    )

    for fixture_name, option_kwargs in fixtures:
        # Options are built per iteration so a failure points at the right fixture.
        read_options = FASTQReadOptions(**option_kwargs)
        reader = session.read_fastq_file(str(DATA / fixture_name), options=read_options)

        assert len(reader.to_polars()) == 2


@pytest.mark.skipif(
    importlib.util.find_spec("polars") is None, reason="polars not installed"
)
def test_read_fasta():
    """Read the `test.fasta` fixture without explicit options and expect two rows."""
    session = connect()

    # No options are passed, so the reader's own defaults are exercised.
    frame = session.read_fasta_file(str(DATA / "test.fasta")).to_polars()

    assert len(frame) == 2


@pytest.mark.skipif(
    importlib.util.find_spec("polars") is None, reason="polars not installed"
)
def test_read_fasta_fa():
    """Read the `test.fa` fixture with a custom `fa` file extension and expect two rows."""
    session = connect()

    read_options = FASTAReadOptions(file_extension="fa")
    reader = session.read_fasta_file(str(DATA / "test.fa"), options=read_options)

    assert len(reader.to_polars()) == 2


@pytest.mark.skipif(
    importlib.util.find_spec("polars") is None, reason="polars not installed"
)
def test_read_fasta_gz():
    """Read the gzip-compressed `test.fa.gz` fixture and expect two rows."""
    session = connect()

    option_kwargs = {
        "file_extension": "fa.gz",
        "file_compression_type": FileCompressionType.GZIP,
    }
    read_options = FASTAReadOptions(**option_kwargs)
    reader = session.read_fasta_file(str(DATA / "test.fa.gz"), options=read_options)

    assert len(reader.to_polars()) == 2


@pytest.mark.skipif(
not importlib.util.find_spec("polars"), reason="polars not installed"
)
Expand Down
100 changes: 100 additions & 0 deletions src/datasources/fasta.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
// Copyright 2024 WHERE TRUE Technologies.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::{error::BioBearResult, file_compression_type::FileCompressionType};
use datafusion::datasource::file_format::file_compression_type::FileCompressionType as DFFileCompressionType;
use exon::datasources::fasta::table_provider::ListingFASTATableOptions;
use pyo3::{pyclass, pymethods};

const DEFAULT_FASTA_FILE_EXTENSION: &str = "fasta";

#[pyclass]
#[derive(Debug, Clone)]
/// Options for reading FASTA files.
///
/// When using from Python, the arguments are optional, but if passed, must be passed as kwargs.
///
/// ```python
/// from biobear import FASTAReadOptions, FileCompressionType
///
/// # Create a new FASTAReadOptions instance with the default values.
/// options = FASTAReadOptions()
///
/// # Create a new FASTAReadOptions instance with the given file extension and file compression type.
/// options = FASTAReadOptions(file_extension="fa", file_compression_type=FileCompressionType.GZIP)
/// ```
///
/// # Examples
///
/// Create a new FASTAReadOptions instance with the default values.
///
/// ```rust,ignore
/// // The fields are private, so this assertion only compiles inside this module.
/// let options = FASTAReadOptions::default();
/// assert_eq!(options.file_extension, "fasta");
/// ```
pub struct FASTAReadOptions {
    /// File extension handed to `ListingFASTATableOptions::with_file_extension`;
    /// defaults to `"fasta"`.
    file_extension: String,
    /// DataFusion compression type handed to `ListingFASTATableOptions::new`;
    /// defaults to `UNCOMPRESSED`.
    file_compression_type: DFFileCompressionType,
}

impl Default for FASTAReadOptions {
fn default() -> Self {
Self {
file_extension: String::from(DEFAULT_FASTA_FILE_EXTENSION),
file_compression_type: DFFileCompressionType::UNCOMPRESSED,
}
}
}

#[pymethods]
impl FASTAReadOptions {
    /// Create a new FASTAReadOptions instance.
    ///
    /// # Arguments
    ///
    /// * `file_extension` - The file extension to use for the FASTA file.
    ///   Defaults to `fasta` when omitted.
    /// * `file_compression_type` - The file compression type to use for the FASTA file.
    ///   Defaults to `UNCOMPRESSED` when omitted.
    ///
    /// # Returns
    ///
    /// A new FASTAReadOptions instance.
    ///
    /// # Errors
    ///
    /// Returns an error if the given compression type cannot be converted into
    /// the DataFusion compression type stored on the options.
    ///
    /// # Note
    ///
    /// The arguments are optional in Python, but if passed, must be passed as kwargs.
    #[new]
    #[pyo3(signature = (*, file_extension=None, file_compression_type=None))]
    pub fn new(
        file_extension: Option<String>,
        file_compression_type: Option<FileCompressionType>,
    ) -> BioBearResult<Self> {
        let file_compression_type = file_compression_type
            .unwrap_or(FileCompressionType::UNCOMPRESSED)
            .try_into()?;

        // Build the default lazily so no String is allocated when the caller
        // already supplied an extension.
        let file_extension =
            file_extension.unwrap_or_else(|| DEFAULT_FASTA_FILE_EXTENSION.to_string());

        Ok(Self {
            file_extension,
            file_compression_type,
        })
    }

    /// Python `repr()`: the `Debug` rendering of the options, matching the
    /// `__repr__` that `FASTQReadOptions` exposes.
    fn __repr__(&self) -> String {
        format!("{:?}", self)
    }
}

impl From<FASTAReadOptions> for ListingFASTATableOptions {
fn from(options: FASTAReadOptions) -> Self {
ListingFASTATableOptions::new(options.file_compression_type)
.with_file_extension(options.file_extension)
}
}
106 changes: 106 additions & 0 deletions src/datasources/fastq.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
// Copyright 2024 WHERE TRUE Technologies.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use crate::{error::BioBearResult, file_compression_type::FileCompressionType};
use datafusion::datasource::file_format::file_compression_type::FileCompressionType as DFFileCompressionType;
use exon::datasources::fastq::table_provider::ListingFASTQTableOptions;
use pyo3::{pyclass, pymethods};

const DEFAULT_FASTQ_FILE_EXTENSION: &str = "fastq";

#[pyclass]
#[derive(Debug, Clone)]
/// Options for reading FASTQ files.
///
/// When using from Python, the arguments are optional, but if passed, must be passed as kwargs.
///
/// ```python
/// from biobear import FASTQReadOptions, FileCompressionType
///
/// # Create a new FASTQReadOptions instance with the default values.
/// options = FASTQReadOptions()
///
/// # Create a new FASTQReadOptions instance with the given file extension and file compression type.
/// options = FASTQReadOptions(file_extension="fq", file_compression_type=FileCompressionType.GZIP)
/// ```
///
/// # Examples
///
/// Create a new FASTQReadOptions instance with the default values.
///
/// ```rust,ignore
/// // The fields are private, so this assertion only compiles inside this module.
/// let options = FASTQReadOptions::default();
/// assert_eq!(options.file_extension, "fastq");
/// ```
pub struct FASTQReadOptions {
    /// File extension handed to `ListingFASTQTableOptions::with_file_extension`;
    /// defaults to `"fastq"`.
    file_extension: String,
    /// DataFusion compression type handed to `ListingFASTQTableOptions::new`;
    /// defaults to `UNCOMPRESSED`.
    file_compression_type: DFFileCompressionType,
}

impl Default for FASTQReadOptions {
fn default() -> Self {
Self {
file_extension: DEFAULT_FASTQ_FILE_EXTENSION.to_string(),
file_compression_type: DFFileCompressionType::UNCOMPRESSED,
}
}
}

#[pymethods]
impl FASTQReadOptions {
    /// Create a new FASTQReadOptions instance.
    ///
    /// # Arguments
    ///
    /// * `file_extension` - The file extension to use for the FASTQ file.
    ///   Defaults to `fastq` when omitted.
    /// * `file_compression_type` - The file compression type to use for the FASTQ file.
    ///   Defaults to `UNCOMPRESSED` when omitted.
    ///
    /// # Returns
    ///
    /// A new FASTQReadOptions instance.
    ///
    /// # Errors
    ///
    /// Returns an error if the given compression type cannot be converted into
    /// the DataFusion compression type stored on the options.
    ///
    /// # Note
    ///
    /// The arguments are optional in Python, but if passed, must be passed as kwargs.
    #[new]
    #[pyo3(signature = (*, file_extension=None, file_compression_type=None))]
    pub fn new(
        file_extension: Option<String>,
        file_compression_type: Option<FileCompressionType>,
    ) -> BioBearResult<Self> {
        let file_compression_type = file_compression_type
            .unwrap_or(FileCompressionType::UNCOMPRESSED)
            .try_into()?;

        // Build the default lazily so no String is allocated when the caller
        // already supplied an extension.
        let file_extension =
            file_extension.unwrap_or_else(|| DEFAULT_FASTQ_FILE_EXTENSION.to_string());

        Ok(Self {
            file_extension,
            file_compression_type,
        })
    }

    /// Python `repr()`: the `Debug` rendering of the options.
    fn __repr__(&self) -> String {
        format!("{:?}", self)
    }
}

impl From<FASTQReadOptions> for ListingFASTQTableOptions {
fn from(options: FASTQReadOptions) -> Self {
ListingFASTQTableOptions::new(options.file_compression_type)
.with_file_extension(options.file_extension)
}
}
16 changes: 16 additions & 0 deletions src/datasources/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright 2024 WHERE TRUE Technologies.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod fasta;
pub mod fastq;
Loading
Loading