Skip to content

Commit

Permalink
Integrate with BiocFrame to access search results (#21)
Browse files Browse the repository at this point in the history
- Remove pandas dependency
- Make scipy optional
  • Loading branch information
jkanche authored Jan 2, 2025
1 parent d90ab64 commit 815f2ab
Show file tree
Hide file tree
Showing 10 changed files with 30 additions and 23 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## Version 0.3.1

- Make pandas optional. List and search functions now return a `BiocFrame` object.
- Since scipy is only used during upload, the package loads it dynamically and makes it optional.

## Version 0.3.0

- chore: Remove Python 3.8 (EOL).
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ print(datasets[["name", "version"]].head(3))
# | 2 | zhong-prefrontal-2018 | 2023-12-22 |
```

This returns a pandas `DataFrame` to easily filter and download datasets of interest.
This returns a `BiocFrame` to easily filter and download datasets of interest.

Users can also search on the metadata text using the `search_datasets()` function. This accepts both simple text queries as well as more complicated expressions involving boolean operations.

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ install_requires =
delayedarray>=0.5.1
summarizedexperiment
singlecellexperiment
pandas

[options.packages.find]
where = src
Expand All @@ -69,6 +68,7 @@ exclude =
# PDF = ReportLab; RXP
optional =
anndata
scipy

# Add here test requirements (semicolon/line-separated)
testing =
Expand Down
14 changes: 7 additions & 7 deletions src/scrnaseq/list_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sqlite3
from functools import lru_cache

import pandas as pd
from biocframe import BiocFrame
from gypsum_client import (
cache_directory,
fetch_metadata_database,
Expand All @@ -14,7 +14,7 @@


@lru_cache
def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> pd.DataFrame:
def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> BiocFrame:
"""List all available datasets.
Example:
Expand All @@ -38,7 +38,7 @@ def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, l
Defaults to True.
Returns:
A :py:class:`~pandas.DataFrame` where each row corresponds to a dataset.
A :py:class:`~biocframe.BiocFrame` where each row corresponds to a dataset.
Each row contains title and description for each dataset,
the number of rows and columns, the organisms and genome builds involved,
whether the dataset has any pre-computed reduced dimensions, and so on.
Expand Down Expand Up @@ -67,7 +67,7 @@ def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, l


def _format_query_results(results: list, key_names: list):
"""Format the results from sqlite as a pandas dataframe.
"""Format the results from sqlite as a BiocFrame.
Key names must be in the exact same order as the query.
"""
Expand All @@ -85,7 +85,7 @@ def _format_query_results(results: list, key_names: list):
def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "meta"):
_all_paths = [None if "/" not in p else p.rsplit("/", 1)[0] for p in results["path"]]

df = pd.DataFrame(
df = BiocFrame(
{
"name": results["asset"],
"version": results["version"],
Expand Down Expand Up @@ -150,10 +150,10 @@ def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "met
for meta in _all_metas:
cursources = meta.get("sources")
if cursources is None:
sources.append(pd.DataFrame(columns=["provider", "id", "version"]))
            sources.append(BiocFrame(column_names=["provider", "id", "version"]))
else:
sources.append(
pd.DataFrame(
BiocFrame(
{
"provider": [s.get("provider") for s in cursources],
"id": [s.get("id") for s in cursources],
Expand Down
5 changes: 4 additions & 1 deletion src/scrnaseq/polish_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Type

import numpy as np
from scipy import sparse as sp
from singlecellexperiment import SingleCellExperiment
from summarizedexperiment import SummarizedExperiment

Expand Down Expand Up @@ -80,6 +79,8 @@ def _polish_dataset(
for asyname, asy in x.assays.items():
if reformat_assay_by_density is not None:
density = min(np.mean(asy != 0), np.mean(asy != np.nan))
from scipy import sparse as sp

if density < reformat_assay_by_density:
if not sp.issparse(asy):
asy = sp.csr_matrix(asy)
Expand All @@ -90,6 +91,8 @@ def _polish_dataset(
if attempt_integer_conversion:
if np.issubdtype(asy.dtype, np.floating):
_cast = False
from scipy import sparse as sp

if sp.issparse(asy):
if not np.any(asy.data % 1 != 0):
_cast = True
Expand Down
6 changes: 3 additions & 3 deletions src/scrnaseq/search_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import lru_cache
from typing import Union

import pandas as pd
from biocframe import BiocFrame
from gypsum_client import cache_directory, fetch_metadata_database
from gypsum_client.search_metadata import (
GypsumSearchClause,
Expand All @@ -22,7 +22,7 @@ def search_datasets(
cache_dir: str = cache_directory(),
overwrite: bool = False,
latest: bool = True,
) -> pd.DataFrame:
) -> BiocFrame:
"""Search for datasets of interest based on matching text in the associated metadata.
This is a wrapper around
Expand Down Expand Up @@ -76,7 +76,7 @@ def search_datasets(
Defaults to True.
Returns:
A :py:class:`~pandas.DataFrame` where each row corresponds to
A :py:class:`~biocframe.BiocFrame` where each row corresponds to
a dataset, containing various columns of metadata.
Some columns may be lists to capture 1:many mappings.
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/test_list_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import tempfile

import pandas as pd
from biocframe import BiocFrame
from scrnaseq import list_datasets

__author__ = "Jayaram Kancherla"
Expand All @@ -11,5 +11,5 @@
def test_list_dataset():
datasets = list_datasets(cache_dir=tempfile.mkdtemp())

assert isinstance(datasets, pd.DataFrame)
assert isinstance(datasets, BiocFrame)
assert len(datasets) > 80
1 change: 0 additions & 1 deletion tests/test_save_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import anndata as ad
import dolomite_base as dl
import numpy as np
import pandas as pd
import pytest
from dolomite_matrix import ReloadedArray
from scrnaseq import fetch_dataset, save_dataset
Expand Down
10 changes: 5 additions & 5 deletions tests/test_search_datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import pandas as pd
from biocframe import BiocFrame
from gypsum_client import define_text_query
from scrnaseq import search_datasets

Expand All @@ -10,14 +10,14 @@
def test_search_datasets():
res = search_datasets("brain")
assert len(res) > 10
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)

res = search_datasets(define_text_query("Neuro%", partial=True))
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)
assert len(res) > 0

res = search_datasets(define_text_query("10090", field="taxonomy_id"))
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)
assert len(res) > 0

res = search_datasets(
Expand All @@ -27,5 +27,5 @@ def test_search_datasets():
| define_text_query("pancrea%", partial=True)
)
)
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)
assert len(res) > 0
4 changes: 2 additions & 2 deletions tests/test_upload_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import dolomite_base as dl
import dolomite_matrix as dlm
import numpy as np
import pandas as pd
import datetime
import pytest
from biocframe import BiocFrame
from gypsum_client import prepare_directory_upload
Expand Down Expand Up @@ -139,7 +139,7 @@ def test_actual_upload_works_correctly():

app_url = "https://gypsum.artifactdb.com"

version = str(pd.Timestamp.today().date())
version = str(datetime.datetime.now().date())
upload_dataset(tmp, "test", version, probation=True, url=app_url, token=gh_token)
fetch_dataset.cache_clear() # Clear cache before fetching

Expand Down

0 comments on commit 815f2ab

Please sign in to comment.