Skip to content

Commit

Permalink
feat: add BZIP2 (.bz2) support for reading fasta and fastq (#185)
Browse files Browse the repository at this point in the history
  • Loading branch information
nickzoic authored Dec 15, 2024
1 parent 17d6bf2 commit 95bf089
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 0 deletions.
1 change: 1 addition & 0 deletions python/biobear/biobear.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class FileCompressionType(enum.Enum):
GZIP = 0
BGZIP = 1
NONE = 2
BZIP2 = 3

class FastaSequenceDataType(enum.Enum):
"""How to treat the sequence data in a FASTA file."""
Expand Down
3 changes: 3 additions & 0 deletions python/biobear/compression.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,15 @@ class Compression(Enum):
INFERRED = "INFERRED"
NONE = "NONE"
GZIP = "GZIP"
BZIP2 = "BZIP2"

@classmethod
def from_file(cls, path: os.PathLike) -> "Compression":
    """Map the final extension of ``path`` to a compression type.

    Only the last suffix is inspected: ``.gz`` yields GZIP, ``.bz2`` yields
    BZIP2, and any other (or missing) suffix yields NONE.
    """
    suffix = Path(path).suffix
    # Lookup table of the recognised compressed-file extensions.
    by_suffix = {
        ".gz": Compression.GZIP,
        ".bz2": Compression.BZIP2,
    }
    return by_suffix.get(suffix, Compression.NONE)

def infer_or_use(self, path: os.PathLike) -> "Compression":
Expand Down
Binary file added python/tests/data/test.fa.bz2
Binary file not shown.
Binary file added python/tests/data/test.fq.bz2
Binary file not shown.
20 changes: 20 additions & 0 deletions python/tests/test_session.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,14 @@ def test_read_fastq():

assert len(df) == 2

fastq_path = DATA / "test.fq.bz2"
options = FASTQReadOptions(
file_extension="fq", file_compression_type=FileCompressionType.BZIP2
)

df = session.read_fastq_file(str(fastq_path), options=options).to_polars()

assert len(df) == 2

@pytest.mark.skipif(
not importlib.util.find_spec("polars"), reason="polars not installed"
Expand Down Expand Up @@ -285,6 +293,18 @@ def test_read_fasta_gz():

assert len(df) == 2

@pytest.mark.skipif(
    not importlib.util.find_spec("polars"), reason="polars not installed"
)
def test_read_fasta_bz2():
    """Test reading a fasta.bz2 file.

    Skipped when polars is not installed, because the result is
    materialised with ``to_polars()``; without the guard the test would
    error rather than skip on installs that lack polars.
    """
    session = connect()

    fasta_path = DATA / "test.fa.bz2"

    # The extension names the underlying format ("fa"); the bzip2 layer is
    # declared separately through the compression type.
    options = FASTAReadOptions(
        file_extension="fa", file_compression_type=FileCompressionType.BZIP2
    )
    df = session.read_fasta_file(str(fasta_path), options=options).to_polars()

    # The test expects the fixture to decode to exactly two rows.
    assert len(df) == 2

@pytest.mark.skipif(
not importlib.util.find_spec("polars"), reason="polars not installed"
Expand Down
5 changes: 5 additions & 0 deletions src/file_compression_type.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pub enum FileCompressionType {
GZIP,
ZSTD,
UNCOMPRESSED,
BZIP2,
}

impl Default for FileCompressionType {
Expand Down Expand Up @@ -56,6 +57,7 @@ impl Display for FileCompressionType {
Self::GZIP => write!(f, "GZIP"),
Self::ZSTD => write!(f, "ZSTD"),
Self::UNCOMPRESSED => write!(f, "UNCOMPRESSED"),
Self::BZIP2 => write!(f, "BZIP2"),
}
}
}
Expand All @@ -66,6 +68,7 @@ impl From<FileCompressionType> for DFFileCompressionType {
FileCompressionType::GZIP => DFFileCompressionType::GZIP,
FileCompressionType::ZSTD => DFFileCompressionType::ZSTD,
FileCompressionType::UNCOMPRESSED => DFFileCompressionType::UNCOMPRESSED,
FileCompressionType::BZIP2 => DFFileCompressionType::BZIP2,
}
}
}
Expand All @@ -78,6 +81,7 @@ impl TryFrom<CompressionTypeVariant> for FileCompressionType {
CompressionTypeVariant::GZIP => Ok(Self::GZIP),
CompressionTypeVariant::ZSTD => Ok(Self::ZSTD),
CompressionTypeVariant::UNCOMPRESSED => Ok(Self::UNCOMPRESSED),
CompressionTypeVariant::BZIP2 => Ok(Self::BZIP2),
_ => Err(BioBearError::InvalidCompressionType(value.to_string())),
}
}
Expand All @@ -91,6 +95,7 @@ impl TryFrom<DFFileCompressionType> for FileCompressionType {
DFFileCompressionType::GZIP => Ok(Self::GZIP),
DFFileCompressionType::ZSTD => Ok(Self::ZSTD),
DFFileCompressionType::UNCOMPRESSED => Ok(Self::UNCOMPRESSED),
DFFileCompressionType::BZIP2 => Ok(Self::BZIP2),
_ => Err(BioBearError::InvalidCompressionType(
"Invalid compression type".to_string(),
)),
Expand Down

0 comments on commit 95bf089

Please sign in to comment.