From 4c2aee78a2656af5ec62f04e64f6a587f24154f0 Mon Sep 17 00:00:00 2001
From: Trent Hauck
Date: Sat, 21 Oct 2023 13:10:22 -0700
Subject: [PATCH] docs: better for duckdb

---
 Makefile                     |  4 +++-
 README.md                    | 27 +++++++++++++++++++++++++++
 python/tests/test_session.py |  4 ++--
 src/execution_result.rs      |  2 +-
 4 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index d63d391..0ce78dd 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,9 @@ build:
 	cargo build --release
 	maturin develop --release
 
-test: build
+test:
+	cargo build
+	maturin develop
 	pytest
 
 run-benchmarks:
diff --git a/README.md b/README.md
index ed2f757..d82cc4f 100644
--- a/README.md
+++ b/README.md
@@ -87,6 +87,32 @@ df.head()
 # └──────────────┴─────────────────┴──────┴───────┴───┴────────────┴────────┴───────┴───────────────────────────────────┘
 ```
 
+## Using DuckDB
+
+biobear can also be used with [duckdb][]: read a file into an Arrow table, then query that table directly from a DuckDB connection.
+
+```python
+import biobear as bb
+import duckdb
+
+session = bb.connect()
+
+session.sql("""
+    CREATE EXTERNAL TABLE gene_annotations STORED AS GFF LOCATION 'python/tests/data/test.gff'
+""")
+
+result = session.sql("""
+    SELECT * FROM gene_annotations
+""")
+
+gff_table_arrow_table = result.to_arrow()
+
+duckdb_conn = duckdb.connect()
+
+result = duckdb_conn.execute('SELECT * FROM gff_table_arrow_table').fetchall()
+print(result)
+```
+
 ## Performance
 
 Please see the [exon][]'s performance metrics for thorough benchmarks, but in short, biobear is generally faster than other Python libraries for reading bioinformatic file formats.
@@ -101,3 +127,4 @@ For example, here's quick benchmarks for reading one FASTA file with 1 million r
 The larger difference multiple files is due to biobear's ability to read multiple files in parallel.
 
 [exon]: https://github.com/wheretrue/exon/tree/main/exon-benchmarks
+[duckdb]: https://duckdb.org/
diff --git a/python/tests/test_session.py b/python/tests/test_session.py
index 6d0f4d0..460270e 100644
--- a/python/tests/test_session.py
+++ b/python/tests/test_session.py
@@ -22,7 +22,7 @@
 DATA = Path(__file__).parent / "data"
 
 
-def test_connect():
+def test_connect_and_to_arrow():
     """Test connecting to a context."""
     session = connect()
 
@@ -32,7 +32,7 @@
     session.sql(query)
 
     query = "SELECT * FROM gff_file"
-    arrow_table = session.sql(query).to_arrow_table()
+    arrow_table = session.sql(query).to_arrow()
 
     assert len(arrow_table) == 2
 
diff --git a/src/execution_result.rs b/src/execution_result.rs
index 271da4f..f64c9ef 100644
--- a/src/execution_result.rs
+++ b/src/execution_result.rs
@@ -50,7 +50,7 @@ impl PyExecutionResult {
     }
 
     /// Convert to Arrow Table
-    fn to_arrow_table(&self, py: Python) -> PyResult<PyObject> {
+    fn to_arrow(&self, py: Python) -> PyResult<PyObject> {
         let batches = self.collect(py)?.to_object(py);
 
         Python::with_gil(|py| {
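
A minimal companion sketch of the workflow the README addition documents (not part of the patch itself). It assumes the `biobear` and `duckdb` Python packages are installed, reuses the test GFF path from the patch, and registers the Arrow table with DuckDB under an explicit view name instead of relying on DuckDB's lookup of local Python variables; the view name `gene_annotations` and variable names are illustrative.

```python
# Sketch: biobear -> Arrow table -> DuckDB (assumes biobear and duckdb are installed).
import biobear as bb
import duckdb

session = bb.connect()

# Expose the GFF file to the biobear session, mirroring the README example.
session.sql("""
    CREATE EXTERNAL TABLE gene_annotations STORED AS GFF LOCATION 'python/tests/data/test.gff'
""")

# Materialize the query result as a pyarrow Table via the renamed to_arrow() method.
annotations = session.sql("SELECT * FROM gene_annotations").to_arrow()

# Explicitly register the Arrow table as a DuckDB view, then query it.
duckdb_conn = duckdb.connect()
duckdb_conn.register("gene_annotations", annotations)
print(duckdb_conn.execute("SELECT COUNT(*) FROM gene_annotations").fetchall())
```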