Skip to content

Commit

Permalink
Add search dataset functions (#7)
Browse files Browse the repository at this point in the history
* bump package version for gypsum_client
* update docstrings and README
  • Loading branch information
jkanche authored May 28, 2024
1 parent 8ba06ad commit fe72510
Show file tree
Hide file tree
Showing 6 changed files with 168 additions and 2 deletions.
19 changes: 19 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,24 @@ datasets = scrnaseq.list_datasets()

This returns a pandas `DataFrame` to easily filter and download datasets of interest.

Users can also search on the metadata text using the `search_datasets()` function. This accepts both simple text queries as well as more complicated expressions involving boolean operations.

```python
from gypsum_client import define_text_query

# Find all datasets involving pancreas.
res = scrnaseq.search_datasets("pancreas")

# Find all mm10 datasets involving pancreas or neurons.
res = scrnaseq.search_datasets(
    define_text_query("GRCm38", field="genome")
    & (
        define_text_query("neuro%", partial=True)
        | define_text_query("pancrea%", partial=True)
    )
)
```

Search results are not guaranteed to be reproducible - more datasets may be added over time, and existing datasets may be updated with new versions. Once a dataset of interest is identified, users should explicitly list the name and version of the dataset in their scripts to ensure reproducibility.

## Fetch Datasets

The `fetch_dataset()` function will download a particular dataset as a `SingleCellExperiment`:
Expand Down Expand Up @@ -61,6 +79,7 @@ Users can also fetch the metadata associated with each dataset:
meta = scrnaseq.fetch_metadata("zeisel-brain-2015", "2023-12-14")
```


## Adding New Datasets

Want to contribute your own dataset to this package? It's easy! Just follow these simple steps:
Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ install_requires =
dolomite_base
dolomite_matrix
dolomite_sce>=0.1.2
gypsum_client>=0.1.2
gypsum_client>=0.1.3
delayedarray>=0.5.1
summarizedexperiment
singlecellexperiment
Expand Down
1 change: 1 addition & 0 deletions src/scrnaseq/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@
from .list_versions import fetch_latest_version, list_versions
from .polish_dataset import polish_dataset
from .save_dataset import save_dataset
from .search_datasets import search_datasets
from .upload_dataset import upload_dataset
3 changes: 2 additions & 1 deletion src/scrnaseq/list_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,10 @@ def list_datasets(
stmt = f"{stmt} AND versions.latest = 1"

_qresults = conn.execute(stmt).fetchall()
results = _format_query_results(_qresults, key_names)
conn.close()

results = _format_query_results(_qresults, key_names)

return _sanitize_query_to_output(results, latest)


Expand Down
114 changes: 114 additions & 0 deletions src/scrnaseq/search_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
import sqlite3
from typing import Union

import pandas as pd
from gypsum_client import cache_directory, fetch_metadata_database
from gypsum_client.search_metadata import (
GypsumSearchClause,
define_text_query,
search_metadata_text,
search_metadata_text_filter,
)

from .list_datasets import _format_query_results, _sanitize_query_to_output

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


def search_datasets(
    query: Union[str, GypsumSearchClause],
    cache_dir: str = cache_directory(),
    overwrite: bool = False,
    latest: bool = True,
) -> pd.DataFrame:
    """Search for datasets of interest based on matching text in the
    associated metadata.

    The text filter is built with
    :py:func:`~gypsum_client.search_metadata.search_metadata_text_filter`
    and applied to the ``scRNAseq`` project in the metadata database
    obtained from
    :py:func:`~gypsum_client.fetch_metadata_database`.

    The returned DataFrame contains the usual suspects like the title
    and description for each dataset, the number of rows and columns,
    the organisms and genome builds involved, whether the dataset has
    any pre-computed reduced dimensions, and so on.
    More details can be found in the Bioconductor
    `metadata index <https://github.com/ArtifactDB/bioconductor-metadata-index>`_.

    See Also:
        :py:func:`~scrnaseq.list_datasets.list_datasets`, to list all
        available datasets.

        :py:func:`~gypsum_client.search_metadata.search_metadata_text`,
        to search metadata.

    Examples:

    .. code-block:: python

        res = search_datasets("brain")

        res = search_datasets(define_text_query("Neuro%", partial=True))

        res = search_datasets(define_text_query("10090", field="taxonomy_id"))

        res = search_datasets(
            define_text_query("GRCm38", field="genome") &
            (define_text_query("neuro%", partial=True) |
            define_text_query("pancrea%", partial=True))
        )

    Args:
        query:
            The search query string, or a
            :py:class:`~gypsum_client.search_metadata.GypsumSearchClause`
            (e.g. built with ``define_text_query``) for more complex
            queries.

        cache_dir:
            Path to cache directory.
            The default is the value of ``cache_directory()`` evaluated
            once, when this module is imported.

        overwrite:
            Whether to overwrite the existing cache.
            Defaults to False.

        latest:
            Whether to fetch only the latest versions of datasets.
            Defaults to True. If False, all versions are searched and
            the ``latest`` flag of each version is also queried.

    Returns:
        A :py:class:`~pandas.DataFrame` where each row corresponds to
        a dataset, containing various columns of metadata.
        Some columns may be lists to capture 1:many mappings.
    """

    bpath = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite)

    where = search_metadata_text_filter(query)
    cond = where["where"]
    params = where["parameters"]

    stmt = "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path"
    key_names = ["meta", "asset", "version", "path"]

    if not latest:
        stmt += ", versions.latest AS latest"
        key_names.append("latest")

    stmt += " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'"

    if latest:
        stmt += " AND versions.latest = 1"

    if cond:
        # The filter clauses carry placeholders; user-supplied values are
        # passed separately through ``params`` when the statement is executed.
        stmt += " AND " + " AND ".join(cond)

    conn = sqlite3.connect(bpath, check_same_thread=False)
    try:
        if cond:
            cursor = conn.execute(stmt, params)
        else:
            cursor = conn.execute(stmt)

        _qresults = cursor.fetchall()
    finally:
        # Always release the database handle, even if the query fails.
        conn.close()

    results = _format_query_results(_qresults, key_names)
    return _sanitize_query_to_output(results, latest)
31 changes: 31 additions & 0 deletions tests/test_search_datasets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import pandas as pd
from gypsum_client import define_text_query
from scrnaseq import search_datasets

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


def test_search_datasets():
    """Exercise ``search_datasets`` with a plain string and with composed
    ``define_text_query`` clauses, checking that each returns a non-empty
    DataFrame."""
    # Plain free-text query.
    brain_hits = search_datasets("brain")
    assert len(brain_hits) > 10
    assert isinstance(brain_hits, pd.DataFrame)

    # Partial (wildcard) match.
    wildcard_query = define_text_query("Neuro%", partial=True)
    wildcard_hits = search_datasets(wildcard_query)
    assert isinstance(wildcard_hits, pd.DataFrame)
    assert len(wildcard_hits) > 0

    # Match restricted to a single metadata field.
    taxonomy_query = define_text_query("10090", field="taxonomy_id")
    taxonomy_hits = search_datasets(taxonomy_query)
    assert isinstance(taxonomy_hits, pd.DataFrame)
    assert len(taxonomy_hits) > 0

    # Boolean combination: genome AND (neuro* OR pancrea*).
    genome_clause = define_text_query("GRCm38", field="genome")
    tissue_clause = define_text_query(
        "neuro%", partial=True
    ) | define_text_query("pancrea%", partial=True)
    combined_hits = search_datasets(genome_clause & tissue_clause)
    assert isinstance(combined_hits, pd.DataFrame)
    assert len(combined_hits) > 0

0 comments on commit fe72510

Please sign in to comment.