Skip to content

Commit

Permalink
Add function to search for reference datasets (#4)
Browse files Browse the repository at this point in the history
  • Loading branch information
jkanche authored May 29, 2024
1 parent cf9e1e2 commit 4da1496
Show file tree
Hide file tree
Showing 7 changed files with 146 additions and 8 deletions.
1 change: 1 addition & 0 deletions src/celldex/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@
from .list_references import list_references
from .list_versions import fetch_latest_version, list_versions
from .save_reference import save_reference
from .search_references import search_references
8 changes: 4 additions & 4 deletions src/celldex/fetch_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,10 @@ def fetch_reference(
Args:
name:
Name of the dataset.
Name of the reference dataset.
version:
Version of the dataset.
Version of the reference dataset.
path:
Path to a subdataset, if name contains multiple datasets.
Expand Down Expand Up @@ -123,10 +123,10 @@ def fetch_metadata(
Args:
name:
Name of the dataset.
Name of the reference dataset.
version:
Version of the dataset.
Version of the reference dataset.
path:
Path to a subdataset, if name contains multiple datasets.
Expand Down
2 changes: 1 addition & 1 deletion src/celldex/list_versions.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def list_versions(name: str) -> List[str]:
Args:
name:
Name of the dataset.
Name of the reference dataset.
Returns:
A list of version names.
Expand Down
2 changes: 1 addition & 1 deletion src/celldex/save_reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

@singledispatch
def save_reference(x: Any, labels: List[str], path: str, metadata: dict):
"""Save a dataset to disk.
"""Save a reference dataset to disk.
Args:
x:
Expand Down
115 changes: 115 additions & 0 deletions src/celldex/search_references.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import sqlite3
from functools import lru_cache
from typing import Union

import pandas as pd
from gypsum_client import cache_directory, fetch_metadata_database
from gypsum_client.search_metadata import (
GypsumSearchClause,
search_metadata_text_filter,
)

from .list_references import _format_query_results, _sanitize_query_to_output

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


@lru_cache
def search_references(
    query: Union[str, GypsumSearchClause],
    cache_dir: str = cache_directory(),
    overwrite: bool = False,
    latest: bool = True,
) -> pd.DataFrame:
    """Search for reference datasets of interest based on matching text in the associated metadata.

    This is a wrapper around
    :py:func:`~gypsum_client.search_metadata.search_metadata_text`.

    The returned :py:class:`~pandas.DataFrame` contains the usual
    suspects like the title and description for each dataset, the
    number of rows and columns, the organisms and genome builds
    involved, whether the dataset has any pre-computed reduced
    dimensions, and so on.

    More details can be found in the Bioconductor
    `metadata index <https://github.com/ArtifactDB/bioconductor-metadata-index>`_.

    Note:
        The function is wrapped in :py:func:`functools.lru_cache`, so all
        arguments must be hashable, and repeated calls with the same
        arguments return the *same* cached object without re-querying
        the database.

    See Also:
        :py:func:`~celldex.list_references.list_references`, to list all
        available datasets.

        :py:func:`~gypsum_client.search_metadata.search_metadata_text`,
        to search metadata.

    Examples:

    .. code-block:: python

        res = search_references("brain")

        res = search_references(define_text_query("Neuro%", partial=True))

        res = search_references(define_text_query("10090", field="taxonomy_id"))

        res = search_references(
            define_text_query("GRCm38", field="genome") &
            (define_text_query("neuro%", partial=True) |
            define_text_query("pancrea%", partial=True))
        )

    Args:
        query:
            The search query string or a
            :py:class:`~gypsum_client.search_metadata.GypsumSearchClause` for
            more complex queries.

        cache_dir:
            Path to cache directory.

        overwrite:
            Whether to overwrite the existing cache.
            Defaults to False.

        latest:
            Whether to fetch only the latest versions of datasets.
            Defaults to True.

    Returns:
        A :py:class:`~pandas.DataFrame` where each row corresponds to
        a dataset, containing various columns of metadata.
        Some columns may be lists to capture 1:many mappings.
    """

    bpath = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite)

    # Translate the query into SQL WHERE fragments plus their bound parameters.
    where = search_metadata_text_filter(query)
    cond = where["where"]
    params = where["parameters"]

    stmt = "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path"
    key_names = ["meta", "asset", "version", "path"]

    # When all versions are requested, also report which rows are the latest.
    if not latest:
        stmt += ", versions.latest AS latest"
        key_names.append("latest")

    # NOTE(review): the project filter is 'scRNAseq' although this module lives
    # in the celldex package -- confirm this is the intended project name.
    stmt += " FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'"

    if latest:
        stmt += " AND versions.latest = 1"

    if cond:
        stmt += " AND " + " AND ".join(cond)

    conn = sqlite3.connect(bpath, check_same_thread=False)
    try:
        # User-supplied search terms are only ever passed as bound parameters.
        if cond:
            cursor = conn.execute(stmt, params)
        else:
            cursor = conn.execute(stmt)

        _qresults = cursor.fetchall()
    finally:
        # Release the database handle even if the query itself fails.
        conn.close()

    results = _format_query_results(_qresults, key_names)
    return _sanitize_query_to_output(results, latest)
5 changes: 3 additions & 2 deletions src/celldex/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ def celldex_load_object(
Args:
path:
Path to the dataset.
Path to the reference dataset.
metadata:
Metadata for the dataset.
Metadata for the reference dataset.
Defaults to None.
celldex_realize_assays:
Expand Down
21 changes: 21 additions & 0 deletions tests/test_search_refs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pandas as pd
from gypsum_client import define_text_query
from celldex import search_references

__author__ = "Jayaram Kancherla"
__copyright__ = "Jayaram Kancherla"
__license__ = "MIT"


def test_search_references():
    """Smoke-test ``search_references`` with a plain string and with text-query clauses."""
    # Plain free-text query.
    brain_hits = search_references("brain")
    assert len(brain_hits) > 10
    assert isinstance(brain_hits, pd.DataFrame)

    # Text query built with partial=True.
    neuro_query = define_text_query("Neuro%", partial=True)
    neuro_hits = search_references(neuro_query)
    assert isinstance(neuro_hits, pd.DataFrame)
    assert len(neuro_hits) > 0

    # Text query restricted to the "taxonomy_id" field.
    taxonomy_query = define_text_query("10090", field="taxonomy_id")
    taxonomy_hits = search_references(taxonomy_query)
    assert isinstance(taxonomy_hits, pd.DataFrame)
    assert len(taxonomy_hits) > 0

0 comments on commit 4da1496

Please sign in to comment.