Add search dataset functions (#7)

* bump package version for gypsum_client * update docstrings and README
BiocPy · May 28, 2024 · fe72510 · fe72510
1 parent 8ba06ad
commit fe72510
Show file tree

Hide file tree

Showing 6 changed files with 168 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -32,6 +32,24 @@ datasets = scrnaseq.list_datasets()
 
 This returns a pandas `DataFrame` to easily filter and download datasets of interest.
 
+Users can also search on the metadata text using the `search_datasets()` function. This accepts both simple text queries and more complicated expressions involving boolean operations.
+
+```python
+# Find all datasets involving pancreas.
+res = search_datasets("pancreas")
+
+# Find all mm10 datasets involving pancreas or neurons.
+res = search_datasets(
+     define_text_query("GRCm38", field="genome")
+     & (
+          define_text_query("neuro%", partial=True)
+          | define_text_query("pancrea%", partial=True)
+     )
+)
+```
+
+Search results are not guaranteed to be reproducible - more datasets may be added over time, and existing datasets may be updated with new versions. Once a dataset of interest is identified, users should explicitly list the name and version of the dataset in their scripts to ensure reproducibility.
+
 ## Fetch Datasets
 
 The `fetch_dataset()` function will download a particular dataset, as `SingleCellExperiment`:
@@ -61,6 +79,7 @@ Users can also fetch the metadata associated with each dataset:
 meta = scrnaseq.fetch_metadata("zeisel-brain-2015", "2023-12-14")
 ```
 
+
 ## Adding New Datasets
 
 Want to contribute your own dataset to this package? It's easy! Just follow these simple steps:

diff --git a/setup.cfg b/setup.cfg
@@ -52,7 +52,7 @@ install_requires =
     dolomite_base
     dolomite_matrix
     dolomite_sce>=0.1.2
-    gypsum_client>=0.1.2
+    gypsum_client>=0.1.3
     delayedarray>=0.5.1
     summarizedexperiment
     singlecellexperiment

diff --git a/src/scrnaseq/__init__.py b/src/scrnaseq/__init__.py
@@ -20,4 +20,5 @@
 from .list_versions import fetch_latest_version, list_versions
 from .polish_dataset import polish_dataset
 from .save_dataset import save_dataset
+from .search_datasets import search_datasets
 from .upload_dataset import upload_dataset
diff --git a/src/scrnaseq/list_datasets.py b/src/scrnaseq/list_datasets.py
@@ -59,9 +59,10 @@ def list_datasets(
         stmt = f"{stmt} AND versions.latest = 1"
 
     _qresults = conn.execute(stmt).fetchall()
-    results = _format_query_results(_qresults, key_names)
     conn.close()
 
+    results = _format_query_results(_qresults, key_names)
+
     return _sanitize_query_to_output(results, latest)
 
 

diff --git a/src/scrnaseq/search_datasets.py b/src/scrnaseq/search_datasets.py
@@ -0,0 +1,114 @@
+import sqlite3
+from typing import Union
+
+import pandas as pd
+from gypsum_client import cache_directory, fetch_metadata_database
+from gypsum_client.search_metadata import (
+    GypsumSearchClause,
+    define_text_query,
+    search_metadata_text,
+    search_metadata_text_filter,
+)
+
+from .list_datasets import _format_query_results, _sanitize_query_to_output
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def search_datasets(
+    query: Union[str, GypsumSearchClause],
+    cache_dir: str = cache_directory(),
+    overwrite: bool = False,
+    latest: bool = True,
+) -> pd.DataFrame:
+    """Search for datasets of interest based on matching text in the
+    associated metadata.
+
+    The query is converted into SQL filter conditions by
+    :py:func:`~gypsum_client.search_metadata.search_metadata_text_filter`.
+
+    The returned DataFrame contains the usual suspects like the title
+    and description for each dataset, the number of rows and columns,
+    the organisms and genome builds involved, whether the dataset has
+    any pre-computed reduced dimensions, and so on.
+
+    More details can be found in the Bioconductor
+    `metadata index <https://github.com/ArtifactDB/bioconductor-metadata-index>`_.
+
+    See Also:
+        :py:func:`~scrnaseq.list_datasets.list_datasets`, to list all
+        available datasets.
+
+        :py:func:`~gypsum_client.search_metadata.search_metadata_text`,
+        to search metadata.
+
+    Examples:
+
+    .. code-block:: python
+
+        res = search_datasets("brain")
+
+        res = search_datasets(define_text_query("Neuro%", partial=True))
+
+        res = search_datasets(define_text_query("10090", field="taxonomy_id"))
+
+        res = search_datasets(
+            define_text_query("GRCm38", field="genome") &
+            (define_text_query("neuro%", partial=True) |
+                define_text_query("pancrea%", partial=True))
+        )
+
+    Args:
+        query:
+            The search query string, or a ``GypsumSearchClause`` (e.g. built
+            with ``define_text_query``) for more complex queries.
+
+        cache_dir:
+            Path to cache directory, passed to ``fetch_metadata_database``.
+
+        overwrite:
+            Whether to overwrite the existing cache.
+            Defaults to False.
+
+        latest:
+            Whether to fetch only the latest versions of datasets.
+            Defaults to True.
+
+    Returns:
+        A :py:class:`~pandas.DataFrame` where each row corresponds to
+        a dataset, containing various columns of metadata.
+        Some columns may be lists to capture 1:many mappings.
+    """
+
+    bpath = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite)
+
+    where = search_metadata_text_filter(query)
+    cond = where["where"]
+    params = where["parameters"]
+
+    stmt = "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path"
+    key_names = ["meta", "asset", "version", "path"]
+
+    if not latest:
+        stmt += ", versions.latest AS latest"
+        key_names.append("latest")
+
+    stmt += " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'"
+
+    if latest:
+        stmt += " AND versions.latest = 1"
+    if cond:
+        stmt += " AND " + " AND ".join(cond)
+
+    # Close the connection even if the query raises, so the handle is not leaked.
+    conn = sqlite3.connect(bpath, check_same_thread=False)
+    try:
+        # Bind parameters only when filter clauses were added to the statement.
+        _qresults = conn.execute(stmt, params if cond else ()).fetchall()
+    finally:
+        conn.close()
+
+    results = _format_query_results(_qresults, key_names)
+    return _sanitize_query_to_output(results, latest)
diff --git a/tests/test_search_datasets.py b/tests/test_search_datasets.py
@@ -0,0 +1,31 @@
+import pandas as pd
+from gypsum_client import define_text_query
+from scrnaseq import search_datasets
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def test_search_datasets():
+    """Check plain, partial, field-restricted and boolean searches."""
+    by_text = search_datasets("brain")
+    assert len(by_text) > 10
+    assert isinstance(by_text, pd.DataFrame)
+
+    wildcard = define_text_query("Neuro%", partial=True)
+    by_wildcard = search_datasets(wildcard)
+    assert isinstance(by_wildcard, pd.DataFrame)
+    assert len(by_wildcard) > 0
+
+    mouse = define_text_query("10090", field="taxonomy_id")
+    by_taxon = search_datasets(mouse)
+    assert isinstance(by_taxon, pd.DataFrame)
+    assert len(by_taxon) > 0
+
+    genome = define_text_query("GRCm38", field="genome")
+    neuro = define_text_query("neuro%", partial=True)
+    pancreas = define_text_query("pancrea%", partial=True)
+    by_combo = search_datasets(genome & (neuro | pancreas))
+    assert isinstance(by_combo, pd.DataFrame)
+    assert len(by_combo) > 0