Skip to content

Commit

Permalink
Integrate with BiocFrame to access search results (#21)
Browse files Browse the repository at this point in the history
- Remove pandas dependency
- Make scipy optional
  • Loading branch information
jkanche authored Jan 2, 2025
1 parent d90ab64 commit 815f2ab
Show file tree
Hide file tree
Showing 10 changed files with 30 additions and 23 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Changelog

## Version 0.3.1

- Make pandas optional. List and search functions now return a `BiocFrame` object.
- Since scipy is only used during upload, the package loads it dynamically and makes it optional.

## Version 0.3.0

- chore: Remove Python 3.8 (EOL).
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ print(datasets[["name", "version"]].head(3))
# | 2 | zhong-prefrontal-2018 | 2023-12-22 |
```

This returns a pandas `DataFrame` to easily filter and download datasets of interest.
This returns a `BiocFrame` to easily filter and download datasets of interest.

Users can also search on the metadata text using the `search_datasets()` function. This accepts both simple text queries as well as more complicated expressions involving boolean operations.

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ install_requires =
delayedarray>=0.5.1
summarizedexperiment
singlecellexperiment
pandas

[options.packages.find]
where = src
Expand All @@ -69,6 +68,7 @@ exclude =
# PDF = ReportLab; RXP
optional =
anndata
scipy

# Add here test requirements (semicolon/line-separated)
testing =
Expand Down
14 changes: 7 additions & 7 deletions src/scrnaseq/list_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import sqlite3
from functools import lru_cache

import pandas as pd
from biocframe import BiocFrame
from gypsum_client import (
cache_directory,
fetch_metadata_database,
Expand All @@ -14,7 +14,7 @@


@lru_cache
def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> pd.DataFrame:
def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True) -> BiocFrame:
"""List all available datasets.
Example:
Expand All @@ -38,7 +38,7 @@ def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, l
Defaults to True.
Returns:
A :py:class:`~pandas.DataFrame` where each row corresponds to a dataset.
A :py:class:`~biocframe.BiocFrame` where each row corresponds to a dataset.
Each row contains title and description for each dataset,
the number of rows and columns, the organisms and genome builds involved,
whether the dataset has any pre-computed reduced dimensions, and so on.
Expand Down Expand Up @@ -67,7 +67,7 @@ def list_datasets(cache_dir: str = cache_directory(), overwrite: bool = False, l


def _format_query_results(results: list, key_names: list):
"""Format the results from sqlite as a pandas dataframe.
"""Format the results from sqlite as a BiocFrame.
Key names must be in the exact same order as the query.
"""
Expand All @@ -85,7 +85,7 @@ def _format_query_results(results: list, key_names: list):
def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "meta"):
_all_paths = [None if "/" not in p else p.rsplit("/", 1)[0] for p in results["path"]]

df = pd.DataFrame(
df = BiocFrame(
{
"name": results["asset"],
"version": results["version"],
Expand Down Expand Up @@ -150,10 +150,10 @@ def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "met
for meta in _all_metas:
cursources = meta.get("sources")
if cursources is None:
sources.append(pd.DataFrame(columns=["provider", "id", "version"]))
            sources.append(BiocFrame(column_names=["provider", "id", "version"]))
else:
sources.append(
pd.DataFrame(
BiocFrame(
{
"provider": [s.get("provider") for s in cursources],
"id": [s.get("id") for s in cursources],
Expand Down
5 changes: 4 additions & 1 deletion src/scrnaseq/polish_dataset.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Type

import numpy as np
from scipy import sparse as sp
from singlecellexperiment import SingleCellExperiment
from summarizedexperiment import SummarizedExperiment

Expand Down Expand Up @@ -80,6 +79,8 @@ def _polish_dataset(
for asyname, asy in x.assays.items():
if reformat_assay_by_density is not None:
density = min(np.mean(asy != 0), np.mean(asy != np.nan))
from scipy import sparse as sp

if density < reformat_assay_by_density:
if not sp.issparse(asy):
asy = sp.csr_matrix(asy)
Expand All @@ -90,6 +91,8 @@ def _polish_dataset(
if attempt_integer_conversion:
if np.issubdtype(asy.dtype, np.floating):
_cast = False
from scipy import sparse as sp

if sp.issparse(asy):
if not np.any(asy.data % 1 != 0):
_cast = True
Expand Down
6 changes: 3 additions & 3 deletions src/scrnaseq/search_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from functools import lru_cache
from typing import Union

import pandas as pd
from biocframe import BiocFrame
from gypsum_client import cache_directory, fetch_metadata_database
from gypsum_client.search_metadata import (
GypsumSearchClause,
Expand All @@ -22,7 +22,7 @@ def search_datasets(
cache_dir: str = cache_directory(),
overwrite: bool = False,
latest: bool = True,
) -> pd.DataFrame:
) -> BiocFrame:
"""Search for datasets of interest based on matching text in the associated metadata.
This is a wrapper around
Expand Down Expand Up @@ -76,7 +76,7 @@ def search_datasets(
Defaults to True.
Returns:
A :py:class:`~pandas.DataFrame` where each row corresponds to
A :py:class:`~biocframe.BiocFrame` where each row corresponds to
a dataset, containing various columns of metadata.
Some columns may be lists to capture 1:many mappings.
"""
Expand Down
4 changes: 2 additions & 2 deletions tests/test_list_dataset.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import tempfile

import pandas as pd
from biocframe import BiocFrame
from scrnaseq import list_datasets

__author__ = "Jayaram Kancherla"
Expand All @@ -11,5 +11,5 @@
def test_list_dataset():
datasets = list_datasets(cache_dir=tempfile.mkdtemp())

assert isinstance(datasets, pd.DataFrame)
assert isinstance(datasets, BiocFrame)
assert len(datasets) > 80
1 change: 0 additions & 1 deletion tests/test_save_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import anndata as ad
import dolomite_base as dl
import numpy as np
import pandas as pd
import pytest
from dolomite_matrix import ReloadedArray
from scrnaseq import fetch_dataset, save_dataset
Expand Down
10 changes: 5 additions & 5 deletions tests/test_search_datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import pandas as pd
from biocframe import BiocFrame
from gypsum_client import define_text_query
from scrnaseq import search_datasets

Expand All @@ -10,14 +10,14 @@
def test_search_datasets():
res = search_datasets("brain")
assert len(res) > 10
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)

res = search_datasets(define_text_query("Neuro%", partial=True))
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)
assert len(res) > 0

res = search_datasets(define_text_query("10090", field="taxonomy_id"))
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)
assert len(res) > 0

res = search_datasets(
Expand All @@ -27,5 +27,5 @@ def test_search_datasets():
| define_text_query("pancrea%", partial=True)
)
)
assert isinstance(res, pd.DataFrame)
assert isinstance(res, BiocFrame)
assert len(res) > 0
4 changes: 2 additions & 2 deletions tests/test_upload_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import dolomite_base as dl
import dolomite_matrix as dlm
import numpy as np
import pandas as pd
import datetime
import pytest
from biocframe import BiocFrame
from gypsum_client import prepare_directory_upload
Expand Down Expand Up @@ -139,7 +139,7 @@ def test_actual_upload_works_correctly():

app_url = "https://gypsum.artifactdb.com"

version = str(pd.Timestamp.today().date())
version = str(datetime.datetime.now().date())
upload_dataset(tmp, "test", version, probation=True, url=app_url, token=gh_token)
fetch_dataset.cache_clear() # Clear cache before fetching

Expand Down

0 comments on commit 815f2ab

Please sign in to comment.