diff --git a/CHANGELOG.md b/CHANGELOG.md
index cc01007..746fb2c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # Changelog
 
+## Version 0.3.1
+
+- Make pandas optional. List and search functions now return a `BiocFrame` object.
+- Since scipy is only used during upload, the package loads it dynamically and makes it optional.
+
 ## Version 0.3.0
 
 - chore: Remove Python 3.8 (EOL).
diff --git a/README.md b/README.md
index f565faf..5f4a3eb 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ print(datasets[["name", "version"]].head(3))
 # | 2 | zhong-prefrontal-2018 | 2023-12-22 |
 ```
 
-This returns a pandas `DataFrame` to easily filter and download datasets of interest.
+This returns a `BiocFrame` to easily filter and download datasets of interest.
 
 Users can also search on the metadata text using the `search_datasets()` function.
 This accepts both simple text queries as well as more complicated expressions involving boolean operations.
diff --git a/setup.cfg b/setup.cfg
index c7c89aa..4293179 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -56,7 +56,6 @@ install_requires =
     delayedarray>=0.5.1
     summarizedexperiment
     singlecellexperiment
-    pandas
 
 [options.packages.find]
 where = src
@@ -69,6 +68,7 @@ exclude =
 # PDF = ReportLab; RXP
 optional =
     anndata
+    scipy
 
 # Add here test requirements (semicolon/line-separated)
 testing =
diff --git a/src/scrnaseq/list_datasets.py b/src/scrnaseq/list_datasets.py
index 9163266..c087ac9 100644
--- a/src/scrnaseq/list_datasets.py
+++ b/src/scrnaseq/list_datasets.py
@@ -2,7 +2,7 @@
 import sqlite3
 from functools import lru_cache
 
-import pandas as pd
+from biocframe import BiocFrame
 from gypsum_client import (
     cache_directory,
     fetch_metadata_database,
@@ -14,7 +14,7 @@
 
 
 @lru_cache
-def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> pd.DataFrame:
+def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> BiocFrame:
     """List all available datasets.
 
     Example:
@@ -38,7 +38,7 @@ def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, l
             Defaults to True.
 
     Returns:
-        A :py:class:`~pandas.DataFrame` where each row corresponds to a dataset.
+        A :py:class:`~biocframe.BiocFrame` where each row corresponds to a dataset.
         Each row contains title and description for each dataset,
         the number of rows and columns, the organisms and genome builds involved,
         whether the dataset has any pre-computed reduced dimensions, and so on.
@@ -67,7 +67,7 @@ def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, l
 
 
 def _format_query_results(results: list, key_names: list):
-    """Format the results from sqlite as a pandas dataframe.
+    """Format the results from sqlite as a BiocFrame.
 
     Key names must be in the exact same order as the query.
""" @@ -85,7 +85,7 @@ def _format_query_results(results: list, key_names: list): def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "meta"): _all_paths = [None if "/" not in p else p.rsplit("/", 1)[0] for p in results["path"]] - df = pd.DataFrame( + df = BiocFrame( { "name": results["asset"], "version": results["version"], @@ -150,10 +150,10 @@ def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "met for meta in _all_metas: cursources = meta.get("sources") if cursources is None: - sources.append(pd.DataFrame(columns=["provider", "id", "version"])) + sources.append(BiocFrame(columns=["provider", "id", "version"])) else: sources.append( - pd.DataFrame( + BiocFrame( { "provider": [s.get("provider") for s in cursources], "id": [s.get("id") for s in cursources], diff --git a/src/scrnaseq/polish_dataset.py b/src/scrnaseq/polish_dataset.py index 3dff829..6778cc0 100644 --- a/src/scrnaseq/polish_dataset.py +++ b/src/scrnaseq/polish_dataset.py @@ -1,7 +1,6 @@ from typing import Type import numpy as np -from scipy import sparse as sp from singlecellexperiment import SingleCellExperiment from summarizedexperiment import SummarizedExperiment @@ -80,6 +79,8 @@ def _polish_dataset( for asyname, asy in x.assays.items(): if reformat_assay_by_density is not None: density = min(np.mean(asy != 0), np.mean(asy != np.nan)) + from scipy import sparse as sp + if density < reformat_assay_by_density: if not sp.issparse(asy): asy = sp.csr_matrix(asy) @@ -90,6 +91,8 @@ def _polish_dataset( if attempt_integer_conversion: if np.issubdtype(asy.dtype, np.floating): _cast = False + from scipy import sparse as sp + if sp.issparse(asy): if not np.any(asy.data % 1 != 0): _cast = True diff --git a/src/scrnaseq/search_datasets.py b/src/scrnaseq/search_datasets.py index b31a070..b19570e 100644 --- a/src/scrnaseq/search_datasets.py +++ b/src/scrnaseq/search_datasets.py @@ -2,7 +2,7 @@ from functools import lru_cache from typing import Union -import pandas as pd +from biocframe import BiocFrame from gypsum_client import cache_directory, fetch_metadata_database from gypsum_client.search_metadata import ( GypsumSearchClause, @@ -22,7 +22,7 @@ def search_datasets( cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True, -) -> pd.DataFrame: +) -> BiocFrame: """Search for datasets of interest based on matching text in the associated metadata. This is a wrapper around @@ -76,7 +76,7 @@ def search_datasets( Defaults to True. Returns: - A :py:class:`~pandas.DataFrame` where each row corresponds to + A :py:class:`~biocframe.BiocFrame` where each row corresponds to a dataset, containing various columns of metadata. Some columns may be lists to capture 1:many mappings. 
""" diff --git a/tests/test_list_dataset.py b/tests/test_list_dataset.py index 380ad46..7a53008 100644 --- a/tests/test_list_dataset.py +++ b/tests/test_list_dataset.py @@ -1,6 +1,6 @@ import tempfile -import pandas as pd +from biocframe import BiocFrame from scrnaseq import list_datasets __author__ = "Jayaram Kancherla" @@ -11,5 +11,5 @@ def test_list_dataset(): datasets = list_datasets(cache_dir=tempfile.mkdtemp()) - assert isinstance(datasets, pd.DataFrame) + assert isinstance(datasets, BiocFrame) assert len(datasets) > 80 diff --git a/tests/test_save_dataset.py b/tests/test_save_dataset.py index 97be19b..9bbbe6f 100644 --- a/tests/test_save_dataset.py +++ b/tests/test_save_dataset.py @@ -6,7 +6,6 @@ import anndata as ad import dolomite_base as dl import numpy as np -import pandas as pd import pytest from dolomite_matrix import ReloadedArray from scrnaseq import fetch_dataset, save_dataset diff --git a/tests/test_search_datasets.py b/tests/test_search_datasets.py index 309bfed..7fe857a 100644 --- a/tests/test_search_datasets.py +++ b/tests/test_search_datasets.py @@ -1,4 +1,4 @@ -import pandas as pd +from biocframe import BiocFrame from gypsum_client import define_text_query from scrnaseq import search_datasets @@ -10,14 +10,14 @@ def test_search_datasets(): res = search_datasets("brain") assert len(res) > 10 - assert isinstance(res, pd.DataFrame) + assert isinstance(res, BiocFrame) res = search_datasets(define_text_query("Neuro%", partial=True)) - assert isinstance(res, pd.DataFrame) + assert isinstance(res, BiocFrame) assert len(res) > 0 res = search_datasets(define_text_query("10090", field="taxonomy_id")) - assert isinstance(res, pd.DataFrame) + assert isinstance(res, BiocFrame) assert len(res) > 0 res = search_datasets( @@ -27,5 +27,5 @@ def test_search_datasets(): | define_text_query("pancrea%", partial=True) ) ) - assert isinstance(res, pd.DataFrame) + assert isinstance(res, BiocFrame) assert len(res) > 0 diff --git a/tests/test_upload_dataset.py b/tests/test_upload_dataset.py index f3794c7..6aac551 100644 --- a/tests/test_upload_dataset.py +++ b/tests/test_upload_dataset.py @@ -7,7 +7,7 @@ import dolomite_base as dl import dolomite_matrix as dlm import numpy as np -import pandas as pd +import datetime import pytest from biocframe import BiocFrame from gypsum_client import prepare_directory_upload @@ -139,7 +139,7 @@ def test_actual_upload_works_correctly(): app_url = "https://gypsum.artifactdb.com" - version = str(pd.Timestamp.today().date()) + version = str(datetime.datatime.now().date()) upload_dataset(tmp, "test", version, probation=True, url=app_url, token=gh_token) fetch_dataset.cache_clear() # Clear cache before fetching