diff --git a/src/celldex/__init__.py b/src/celldex/__init__.py index 83b951e..2ac565f 100644 --- a/src/celldex/__init__.py +++ b/src/celldex/__init__.py @@ -19,3 +19,4 @@ from .list_references import list_references from .list_versions import fetch_latest_version, list_versions from .save_reference import save_reference +from .search_references import search_references diff --git a/src/celldex/fetch_reference.py b/src/celldex/fetch_reference.py index 3a246b4..b3b7a9c 100644 --- a/src/celldex/fetch_reference.py +++ b/src/celldex/fetch_reference.py @@ -47,10 +47,10 @@ def fetch_reference( Args: name: - Name of the dataset. + Name of the reference dataset. version: - Version of the dataset. + Version of the reference dataset. path: Path to a subdataset, if name contains multiple datasets. @@ -123,10 +123,10 @@ def fetch_metadata( Args: name: - Name of the dataset. + Name of the reference dataset. version: - Version of the dataset. + Version of the reference dataset. path: Path to a subdataset, if name contains multiple datasets. diff --git a/src/celldex/list_versions.py b/src/celldex/list_versions.py index 65398e6..f749f82 100644 --- a/src/celldex/list_versions.py +++ b/src/celldex/list_versions.py @@ -18,7 +18,7 @@ def list_versions(name: str) -> List[str]: Args: name: - Name of the dataset. + Name of the reference dataset. Returns: A list of version names. diff --git a/src/celldex/save_reference.py b/src/celldex/save_reference.py index 7a96c4a..b81569d 100644 --- a/src/celldex/save_reference.py +++ b/src/celldex/save_reference.py @@ -18,7 +18,7 @@ @singledispatch def save_reference(x: Any, labels: List[str], path: str, metadata: dict): - """Save a dataset to disk. + """Save a reference dataset to disk. 
@lru_cache
def search_references(
    query: Union[str, GypsumSearchClause],
    cache_dir: str = cache_directory(),
    overwrite: bool = False,
    latest: bool = True,
) -> pd.DataFrame:
    """Search for reference datasets based on matching text in their metadata.

    The text filter is built with
    :py:func:`~gypsum_client.search_metadata.search_metadata_text_filter`
    and executed against the SQLite metadata database obtained from
    :py:func:`~gypsum_client.fetch_metadata_database`, restricted to assets
    of the ``celldex`` project.

    More details on the database can be found in the Bioconductor
    `metadata index <https://github.com/ArtifactDB/bioconductor-metadata-index>`_.

    Note:
        Results are memoised with :py:func:`functools.lru_cache`. All
        arguments must therefore be hashable, and repeated calls with equal
        arguments return the *same* :py:class:`~pandas.DataFrame` object
        without re-querying (so ``overwrite=True`` only has an effect on the
        first such call). Copy the result before mutating it.

        The default for ``cache_dir`` is evaluated once, when this module is
        imported.

    See Also:
        :py:func:`~celldex.list_references.list_references`, to list all
        available datasets.

        :py:func:`~gypsum_client.search_metadata.search_metadata_text`,
        to search metadata.

    Examples:

        .. code-block:: python

            from gypsum_client import define_text_query

            res = search_references("brain")

            res = search_references(define_text_query("Neuro%", partial=True))

            res = search_references(define_text_query("10090", field="taxonomy_id"))

            res = search_references(
                define_text_query("GRCm38", field="genome") &
                (define_text_query("neuro%", partial=True) |
                 define_text_query("pancrea%", partial=True))
            )

    Args:
        query:
            The search query string or a
            :py:class:`~gypsum_client.search_metadata.GypsumSearchClause` for
            more complex queries.

        cache_dir:
            Path to cache directory.

        overwrite:
            Whether to overwrite the existing cache.
            Defaults to False.

        latest:
            Whether to fetch only the latest versions of datasets.
            Defaults to True.

    Returns:
        A :py:class:`~pandas.DataFrame` where each row corresponds to
        a dataset, containing various columns of metadata.
        Some columns may be lists to capture 1:many mappings.
    """
    bpath = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite)

    text_filter = search_metadata_text_filter(query)
    conditions = text_filter["where"]
    parameters = text_filter["parameters"]

    key_names = ["meta", "asset", "version", "path"]
    stmt = (
        "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset,"
        " versions.version AS version, path"
    )

    # The "latest" flag is only worth reporting when older versions can
    # also appear in the result.
    if not latest:
        stmt += ", versions.latest AS latest"
        key_names.append("latest")

    # Restrict to this package's own project; the filter previously named
    # the 'scRNAseq' project, which returned assets of a different package.
    stmt += (
        " FROM paths LEFT JOIN versions ON paths.vid = versions.vid"
        " WHERE versions.project = 'celldex'"
    )

    if latest:
        stmt += " AND versions.latest = 1"

    conn = sqlite3.connect(bpath, check_same_thread=False)
    try:
        if conditions:
            # User-supplied text only travels through bound parameters;
            # the condition fragments come from the filter builder.
            stmt += " AND " + " AND ".join(conditions)
            rows = conn.execute(stmt, parameters).fetchall()
        else:
            rows = conn.execute(stmt).fetchall()
    finally:
        # Release the database handle even when the query fails.
        conn.close()

    results = _format_query_results(rows, key_names)
    return _sanitize_query_to_output(results, latest)
def test_search_references():
    """Smoke-test ``search_references`` with a plain string and with clauses.

    Each query only has to return a non-empty DataFrame; no result-count
    threshold is asserted, so the test does not depend on how many entries
    the metadata index currently holds.
    """
    # NOTE(review): these run against the live metadata index. The terms are
    # chosen to be generic (species / immunology vocabulary) — confirm each
    # still matches at least one reference dataset.
    queries = (
        "human",
        define_text_query("Immun%", partial=True),
        define_text_query("10090", field="taxonomy_id"),
    )

    for query in queries:
        res = search_references(query)
        # Check the type first so a wrong return type fails with a clear
        # message instead of a confusing len() result.
        assert isinstance(res, pd.DataFrame), f"unexpected result type for {query!r}"
        assert len(res) > 0, f"no matches for {query!r}"