diff --git a/README.md b/README.md index 9fb5ded..2769cea 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,24 @@ datasets = scrnaseq.list_datasets() This returns a pandas `DataFrame` to easily filter and download datasets of interest. +Users can also search on the metadata text using the `search_datasets()` function. This accepts both simple text queries and more complicated expressions involving boolean operations. + +```python +# Find all datasets involving pancreas. +res = scrnaseq.search_datasets("pancreas") + +# Find all mm10 datasets involving pancreas or neurons. +res = scrnaseq.search_datasets( + define_text_query("GRCm38", field="genome") + & ( + define_text_query("neuro%", partial=True) + | define_text_query("pancrea%", partial=True) + ) +) +``` + +Search results are not guaranteed to be reproducible - more datasets may be added over time, and existing datasets may be updated with new versions. Once a dataset of interest is identified, users should explicitly list the name and version of the dataset in their scripts to ensure reproducibility. + ## Fetch Datasets The `fetch_dataset()` function will download a particular dataset, as `SingleCellExperiment`: @@ -61,6 +79,7 @@ Users can also fetch the metadata associated with each dataset: meta = scrnaseq.fetch_metadata("zeisel-brain-2015", "2023-12-14") ``` + ## Adding New Datasets Want to contribute your own dataset to this package? It's easy! 
Just follow these simple steps: diff --git a/setup.cfg b/setup.cfg index 1f2a991..773c848 100644 --- a/setup.cfg +++ b/setup.cfg @@ -52,7 +52,7 @@ install_requires = dolomite_base dolomite_matrix dolomite_sce>=0.1.2 - gypsum_client>=0.1.2 + gypsum_client>=0.1.3 delayedarray>=0.5.1 summarizedexperiment singlecellexperiment diff --git a/src/scrnaseq/__init__.py b/src/scrnaseq/__init__.py index 9441006..148216f 100644 --- a/src/scrnaseq/__init__.py +++ b/src/scrnaseq/__init__.py @@ -20,4 +20,5 @@ from .list_versions import fetch_latest_version, list_versions from .polish_dataset import polish_dataset from .save_dataset import save_dataset +from .search_datasets import search_datasets from .upload_dataset import upload_dataset diff --git a/src/scrnaseq/list_datasets.py b/src/scrnaseq/list_datasets.py index 1c0e8cf..8ef72ca 100644 --- a/src/scrnaseq/list_datasets.py +++ b/src/scrnaseq/list_datasets.py @@ -59,9 +59,10 @@ def list_datasets( stmt = f"{stmt} AND versions.latest = 1" _qresults = conn.execute(stmt).fetchall() - results = _format_query_results(_qresults, key_names) conn.close() + results = _format_query_results(_qresults, key_names) + return _sanitize_query_to_output(results, latest) diff --git a/src/scrnaseq/search_datasets.py b/src/scrnaseq/search_datasets.py new file mode 100644 index 0000000..5bc23f9 --- /dev/null +++ b/src/scrnaseq/search_datasets.py @@ -0,0 +1,114 @@ +import sqlite3 +from typing import Union + +import pandas as pd +from gypsum_client import cache_directory, fetch_metadata_database +from gypsum_client.search_metadata import ( + GypsumSearchClause, + define_text_query, + search_metadata_text, + search_metadata_text_filter, +) + +from .list_datasets import _format_query_results, _sanitize_query_to_output + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def search_datasets( + query: Union[str, GypsumSearchClause], + cache_dir: str = cache_directory(), + overwrite: bool = False, + latest: bool 
= True, +) -> pd.DataFrame: + """Search for datasets of interest based on matching text in the + associated metadata. + + This is a wrapper around + :py:func:`~gypsum_client.search_metadata.search_metadata_text`. + + The returned DataFrame contains the usual suspects like the title + and description for each dataset, the number of rows and columns, + the organisms and genome builds involved, whether the dataset has + any pre-computed reduced dimensions, and so on. + + More details can be found in the Bioconductor + `metadata index `_. + + See Also: + :py:func:`~scrnaseq.list_datasets.list_datasets`, to list all + available datasets. + + :py:func:`~gypsum_client.search_metadata.search_metadata_text`, + to search metadata. + + Examples: + + .. code-block:: python + + res = search_datasets("brain") + + res = search_datasets(define_text_query("Neuro%", partial=True)) + + res = search_datasets(define_text_query("10090", field="taxonomy_id")) + + res = search_datasets( + define_text_query("GRCm38", field="genome") & + (define_text_query("neuro%", partial=True) | + define_text_query("pancrea%", partial=True)) + ) + + Args: + query: + The search query string or a GypsumSearchClause object for + more complex queries. + + cache_dir: + Path to cache directory. + + overwrite: + Whether to overwrite the existing cache. + Defaults to False. + + latest: + Whether to fetch only the latest versions of datasets. + Defaults to True. + + Returns: + A :py:class:`~pandas.DataFrame` where each row corresponds to + a dataset, containing various columns of metadata. + Some columns may be lists to capture 1:many mappings. 
+ """ + + bpath = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite) + + where = search_metadata_text_filter(query) + cond = where["where"] + params = where["parameters"] + + conn = sqlite3.connect(bpath, check_same_thread=False) + stmt = "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path" + key_names = ["meta", "asset", "version", "path"] + + if not latest: + stmt += ", versions.latest AS latest" + key_names.append("latest") + + stmt += " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'" + + if latest: + stmt += " AND versions.latest = 1" + + if cond: + stmt += " AND " + " AND ".join(cond) + cursor = conn.execute(stmt, params) + else: + cursor = conn.execute(stmt) + + _qresults = cursor.fetchall() + conn.close() + + results = _format_query_results(_qresults, key_names) + return _sanitize_query_to_output(results, latest) diff --git a/tests/test_search_datasets.py b/tests/test_search_datasets.py new file mode 100644 index 0000000..309bfed --- /dev/null +++ b/tests/test_search_datasets.py @@ -0,0 +1,31 @@ +import pandas as pd +from gypsum_client import define_text_query +from scrnaseq import search_datasets + +__author__ = "Jayaram Kancherla" +__copyright__ = "Jayaram Kancherla" +__license__ = "MIT" + + +def test_search_datasets(): + res = search_datasets("brain") + assert len(res) > 10 + assert isinstance(res, pd.DataFrame) + + res = search_datasets(define_text_query("Neuro%", partial=True)) + assert isinstance(res, pd.DataFrame) + assert len(res) > 0 + + res = search_datasets(define_text_query("10090", field="taxonomy_id")) + assert isinstance(res, pd.DataFrame) + assert len(res) > 0 + + res = search_datasets( + define_text_query("GRCm38", field="genome") + & ( + define_text_query("neuro%", partial=True) + | define_text_query("pancrea%", partial=True) + ) + ) + assert isinstance(res, pd.DataFrame) + assert len(res) > 0