diff --git a/.github/workflows/pypi-test.yml b/.github/workflows/pypi-test.yml
index f35fd23..8f58241 100644
--- a/.github/workflows/pypi-test.yml
+++ b/.github/workflows/pypi-test.yml
@@ -2,36 +2,54 @@ name: Test the library
 
 on:
   push:
-    branches: [ master ]
+    branches: [master]
   pull_request:
-    branches: [ master ]
+    branches: [master]
 
 jobs:
   build:
-
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: [ '3.8', '3.9', '3.10', '3.11', '3.12' ]
+        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"]
 
     name: Python ${{ matrix.python-version }}
     steps:
-      - uses: actions/checkout@v2
-      - name: Setup Python
-        uses: actions/setup-python@v2
-        with:
-          python-version: ${{ matrix.python-version }}
-          cache: 'pip'
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install flake8 pytest tox
-      # - name: Lint with flake8
-      #   run: |
-      #     # stop the build if there are Python syntax errors or undefined names
-      #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
-      #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
-      #     # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
-      - name: Test with tox
-        run: |
-          tox
+      - uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+
+      # - name: Update SQLite3
+      #   run: sudo apt install -y sqlite3
+
+      # Build SQLite from source, because we need SQLite >= 3.35.
+      - run: |
+          wget https://www.sqlite.org/2024/sqlite-autoconf-3450300.tar.gz
+          tar -xvf sqlite-autoconf-3450300.tar.gz
+      - run: |
+          ./configure
+          make
+          sudo make install
+          # The freshly built library lands in /usr/local/lib; later steps
+          # point the loader at it via LD_LIBRARY_PATH.
+          export LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH"
+        working-directory: sqlite-autoconf-3450300
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install flake8 pytest tox
+        env:
+          LD_LIBRARY_PATH: /usr/local/lib
+      # - name: Lint with flake8
+      #   run: |
+      #     # stop the build if there are Python syntax errors or undefined names
+      #     flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
+      #     # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
+      #     # flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
+      - name: Test with tox
+        run: |
+          tox
+        env:
+          LD_LIBRARY_PATH: /usr/local/lib
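For quick verification that the newly built SQLite is the one Python actually loads, a step could run a snippet like the following. This is a hedged sketch and not part of the workflow as written; `sqlite3` here is the standard-library module.

import sqlite3

# Expect the version from the tarball above (3.45.3) once LD_LIBRARY_PATH
# includes /usr/local/lib; anything >= 3.35 satisfies the stated requirement.
print(sqlite3.sqlite_version)
assert tuple(map(int, sqlite3.sqlite_version.split("."))) >= (3, 35, 0)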
diff --git a/setup.cfg b/setup.cfg
index 9f4d887..013c9e6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -51,11 +51,12 @@ install_requires =
     importlib-metadata; python_version<"3.8"
     dolomite_base
     dolomite_matrix
-    dolomite_sce
+    dolomite_sce>=0.1.2
    gypsum_client>=0.1.1
     delayedarray
     summarizedexperiment
     singlecellexperiment
+    pandas
 
 [options.packages.find]
 where = src
@@ -66,12 +67,16 @@ exclude =
 # Add here additional requirements for extra features, to install with:
 # `pip install scrnaseq[PDF]` like:
 # PDF = ReportLab; RXP
+optional =
+    anndata
 
 # Add here test requirements (semicolon/line-separated)
 testing =
     setuptools
     pytest
     pytest-cov
+    scipy
+    anndata
 
 [options.entry_points]
 # Add here console scripts like:
diff --git a/src/scrnaseq/__init__.py b/src/scrnaseq/__init__.py
index 5915d6e..254d666 100644
--- a/src/scrnaseq/__init__.py
+++ b/src/scrnaseq/__init__.py
@@ -15,4 +15,7 @@ finally:
     del version, PackageNotFoundError
 
-from .fetch_dataset import fetch_dataset, fetch_metadata
\ No newline at end of file
+from .fetch_dataset import fetch_dataset, fetch_metadata
+from .list_datasets import list_datasets
+from .list_versions import fetch_latest_version, list_versions
+from .save_dataset import save_dataset
\ No newline at end of file
diff --git a/src/scrnaseq/fetch_dataset.py b/src/scrnaseq/fetch_dataset.py
index 02c4ccc..ada68ff 100644
--- a/src/scrnaseq/fetch_dataset.py
+++ b/src/scrnaseq/fetch_dataset.py
@@ -2,12 +2,12 @@ import json
 import os
 
-from delayedarray import is_sparse, to_dense_array, to_scipy_sparse_matrix
-from dolomite_base import alt_read_object, alt_read_object_function, read_object
+from dolomite_base import alt_read_object, alt_read_object_function
 from gypsum_client import cache_directory, save_file, save_version
-from singlecellexperiment import SingleCellExperiment
 from summarizedexperiment import SummarizedExperiment
 
+from .utils import single_cell_load_object
+
 __author__ = "Jayaram Kancherla"
 __copyright__ = "Jayaram Kancherla"
 __license__ = "MIT"
@@ -24,7 +24,24 @@ def fetch_dataset(
     realize_reduced_dims: bool = True,
     **kwargs,
 ) -> SummarizedExperiment:
-    """Fetch a dataset from the gypsum backend.
+    """Fetch a single-cell dataset from the gypsum backend.
+
+    See Also:
+        `metadata index `_,
+        for the expected schema of the metadata.
+
+        :py:func:`~scrnaseq.save_dataset.save_dataset` and
+        :py:func:`~gypsum_client.upload_file_operations.upload_directory`,
+        to save and upload a dataset.
+
+        :py:func:`~scrnaseq.list_datasets.list_datasets` and
+        :py:func:`~scrnaseq.list_versions.list_versions`,
+        to get possible values for `name` and `version`.
+
+    Example:
+
+    .. code-block:: python
+
+        sce = fetch_dataset("zeisel-brain-2015", "2023-12-14")
 
     Args:
         name:
@@ -99,6 +116,16 @@ def fetch_metadata(
 ):
     """Fetch metadata for a dataset from the gypsum backend.
 
+    See Also:
+        :py:func:`~.fetch_dataset`,
+        to fetch a dataset.
+
+    Example:
+
+    .. code-block:: python
+
+        meta = fetch_metadata("zeisel-brain-2015", "2023-12-14")
+
     Args:
         name:
             Name of the dataset.
@@ -133,85 +160,3 @@ def fetch_metadata(
         metadata = json.load(f)
 
     return metadata
-
-
-def single_cell_load_object(
-    path: str,
-    metadata: dict = None,
-    scrnaseq_realize_assays: bool = False,
-    scrnaseq_realize_reduced_dims: bool = True,
-    **kwargs,
-):
-    """Load a ``SummarizedExperiment`` or ``SingleCellExperiment`` object from a file.
-
-    Args:
-        path:
-            Path to the dataset.
-
-        metadata:
-            Metadata for the dataset.
-            Defaults to None.
-
-        scrnaseq_realize_assays:
-            Whether to realize assays into memory.
-            Defaults to False.
-
-        scrnaseq_realize_reduced_dims:
-            Whether to realize reduced dimensions into memory.
-            Defaults to True.
-
-        **kwargs:
-            Further arguments to pass to
-            :py:func:`~dolomite_base.read_object.read_object`.
-
-    Returns:
-        A `SummarizedExperiment` of the object.
-    """
-    obj = read_object(
-        path,
-        metadata=metadata,
-        scrnaseq_realize_assays=scrnaseq_realize_assays,
-        scrnaseq_realize_reduced_dims=scrnaseq_realize_reduced_dims,
-        **kwargs,
-    )
-
-    if isinstance(obj, SummarizedExperiment):
-        if scrnaseq_realize_assays:
-            _assays = {}
-            for y in obj.get_assay_names():
-                _assays[y] = realize_array(obj.assay(y))
-
-            obj = obj.set_assays(_assays)
-
-    if isinstance(obj, SingleCellExperiment):
-        if scrnaseq_realize_reduced_dims:
-            _red_dims = {}
-            for z in obj.get_reduced_dim_names():
-                _red_dims[z] = realize_array(obj.reduced_dim(z))
-
-            obj = obj.set_reduced_dims(_red_dims)
-
-    return obj
-
-
-def realize_array(x):
-    """
-    Realize a `ReloadedArray` into a dense array or sparse matrix.
-
-    Args:
-        x:
-            `ReloadedArray` object.
-
-    Returns:
-
-        Realized array or matrix.
-    """
-    from dolomite_matrix import ReloadedArray
-
-    if isinstance(x, ReloadedArray):
-        if is_sparse(x):
-            x = to_scipy_sparse_matrix(x, "csr")
-        else:
-            x = to_dense_array(x)
-
-    return x
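A hedged usage sketch of the two public helpers documented above, mirroring their docstring examples:

import scrnaseq

# Assays stay as file-backed ReloadedArray objects unless explicitly realized.
sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14")
meta = scrnaseq.fetch_metadata("zeisel-brain-2015", "2023-12-14")
print(sce.shape, meta["title"])

# Opt in to in-memory (dense or scipy sparse) assays.
sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)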
diff --git a/src/scrnaseq/list_datasets.py b/src/scrnaseq/list_datasets.py
new file mode 100644
index 0000000..df86038
--- /dev/null
+++ b/src/scrnaseq/list_datasets.py
@@ -0,0 +1,194 @@
+import json
+import sqlite3
+from functools import lru_cache
+
+import pandas as pd
+from gypsum_client import (
+    cache_directory,
+    fetch_metadata_database,
+)
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+@lru_cache
+def list_datasets(
+    cache_dir: str = cache_directory(), overwrite: bool = False, latest: bool = True
+) -> pd.DataFrame:
+    """List all available datasets.
+
+    Example:
+
+    .. code-block:: python
+
+        datasets = list_datasets()
+
+    Args:
+        cache_dir:
+            Path to the cache directory.
+
+        overwrite:
+            Whether to overwrite the database in the cache.
+            Defaults to False.
+
+        latest:
+            Whether to only fetch the latest version of each dataset.
+            Defaults to True.
+
+    Returns:
+        A pandas DataFrame where each row corresponds to a dataset.
+        Each row contains the title and description of the dataset,
+        the number of rows and columns, the organisms and genome builds involved,
+        whether the dataset has any pre-computed reduced dimensions, and so on.
+        More details can be found in the
+        `Bioconductor metadata schema `_.
+    """
+    db_path = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite)
+    conn = sqlite3.connect(db_path, check_same_thread=False)
+
+    stmt = "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path"
+    key_names = ["meta", "asset", "version", "path"]
+    if latest is not True:
+        stmt = f"{stmt}, versions.latest AS latest"
+        key_names.append("latest")
+
+    stmt = f"{stmt} FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'"
+    if latest is True:
+        stmt = f"{stmt} AND versions.latest = 1"
+
+    _qresults = conn.execute(stmt).fetchall()
+    results = _format_query_results(_qresults, key_names)
+    conn.close()
+
+    return _sanitize_query_to_output(results, latest)
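Since `list_datasets` returns a plain pandas DataFrame, callers can subset it with ordinary pandas operations. A hedged sketch; note that `taxonomy_id` holds lists, so membership tests go through `apply`:

from scrnaseq import list_datasets

datasets = list_datasets()
# Keep only mouse datasets (NCBI taxonomy ID 10090).
mouse = datasets[datasets["taxonomy_id"].apply(lambda ids: "10090" in ids)]
print(mouse[["name", "version", "title"]].head())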
+ """ + db_path = fetch_metadata_database(cache_dir=cache_dir, overwrite=overwrite) + conn = sqlite3.connect(db_path, check_same_thread=False) + + stmt = "SELECT json_extract(metadata, '$') AS meta, versions.asset AS asset, versions.version AS version, path" + key_names = ["meta", "asset", "version", "path"] + if latest is not True: + stmt = f"{stmt} versions.latest AS latest" + key_names.append("latest") + + stmt = f"{stmt} FROM paths LEFT JOIN versions ON paths.vid = versions.vid WHERE versions.project = 'scRNAseq'" + if latest is True: + stmt = f"{stmt} AND versions.latest = 1" + + _qresults = conn.execute(stmt).fetchall() + results = _format_query_results(_qresults, key_names) + conn.close() + + return _sanitize_query_to_output(results, latest) + + +def _format_query_results(results: list, key_names: list): + """Format the results from sqlite as a pandas dataframe + + Key names must be in the exact same order as the query. + """ + _out = {} + for k in key_names: + _out[k] = [] + + for r in results: + for idx, k in enumerate(key_names): + _out[k].append(r[idx]) + + return _out + + +def _sanitize_query_to_output(results: list, latest: bool, meta_name: str = "meta"): + _all_paths = [ + None if "/" not in p else p.rsplit("/", 1)[0] for p in results["path"] + ] + + df = pd.DataFrame( + { + "name": results["asset"], + "version": results["version"], + "path": _all_paths, + } + ) + if not latest: + _all_latest = [s == 1 for s in results["latest"]] + df["latest"] = _all_latest + + _all_metas = [json.loads(s) for s in results[meta_name]] + + df["object"] = _extract_atomic_from_json( + _all_metas, lambda x: x.get("applications", {}).get("takane", {}).get("type") + ) + df["title"] = _extract_atomic_from_json(_all_metas, lambda x: x.get("title")) + df["description"] = _extract_atomic_from_json(_all_metas, lambda x: x.get("title")) + df["taxonomy_id"] = _extract_charlist_from_json( + _all_metas, lambda x: x.get("taxonomy_id") + ) + df["genome"] = _extract_charlist_from_json(_all_metas, lambda x: x.get("genome")) + + df["rows"] = _extract_atomic_from_json( + _all_metas, + lambda x: x.get("applications", {}) + .get("takane", {}) + .get("summarized_experiment", {}) + .get("rows"), + ) + + df["columns"] = _extract_atomic_from_json( + _all_metas, + lambda x: x.get("applications", {}) + .get("takane", {}) + .get("summarized_experiment", {}) + .get("columns"), + ) + + df["assays"] = _extract_charlist_from_json( + _all_metas, + lambda x: x.get("applications", {}) + .get("takane", {}) + .get("summarized_experiment", {}) + .get("assays"), + ) + df["column_annotations"] = _extract_charlist_from_json( + _all_metas, + lambda x: x.get("applications", {}) + .get("takane", {}) + .get("summarized_experiment", {}) + .get("column_annotations"), + ) + df["reduced_dimensions"] = _extract_charlist_from_json( + _all_metas, + lambda x: x.get("applications", {}) + .get("takane", {}) + .get("single_cell_experiment", {}) + .get("reduced_dimensions"), + ) + df["alternative_experiments"] = _extract_charlist_from_json( + _all_metas, + lambda x: x.get("applications", {}) + .get("takane", {}) + .get("single_cell_experiment", {}) + .get("alternative_experiments"), + ) + + df["bioconductor_version"] = _extract_atomic_from_json( + _all_metas, lambda x: x.get("bioconductor_version") + ) + df["maintainer_name"] = _extract_atomic_from_json( + _all_metas, lambda x: x.get("maintainer_name") + ) + df["maintainer_email"] = _extract_atomic_from_json( + _all_metas, lambda x: x.get("maintainer_email") + ) + + sources = [] + for meta in 
diff --git a/src/scrnaseq/list_versions.py b/src/scrnaseq/list_versions.py
new file mode 100644
index 0000000..ae1023b
--- /dev/null
+++ b/src/scrnaseq/list_versions.py
@@ -0,0 +1,45 @@
+from typing import List
+
+import gypsum_client as gypc
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def list_versions(name: str) -> List[str]:
+    """List all available versions for a dataset.
+
+    Example:
+
+    .. code-block:: python
+
+        versions = list_versions("romanov-brain-2017")
+
+    Args:
+        name:
+            Name of the dataset.
+
+    Returns:
+        A list of version names.
+    """
+    return gypc.list_versions("scRNAseq", name)
+
+
+def fetch_latest_version(name: str) -> str:
+    """Fetch the latest version of a dataset.
+
+    Example:
+
+    .. code-block:: python
+
+        version = fetch_latest_version("romanov-brain-2017")
+
+    Args:
+        name:
+            Name of the dataset.
+
+    Returns:
+        The latest version name.
+    """
+    return gypc.fetch_latest("scRNAseq", name)
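The two helpers compose naturally with `fetch_dataset`; a hedged usage sketch:

import scrnaseq

versions = scrnaseq.list_versions("romanov-brain-2017")
latest = scrnaseq.fetch_latest_version("romanov-brain-2017")
assert latest in versions

# Pin the download to the latest known version.
sce = scrnaseq.fetch_dataset("romanov-brain-2017", latest)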
diff --git a/src/scrnaseq/save_dataset.py b/src/scrnaseq/save_dataset.py
new file mode 100644
index 0000000..e73d21e
--- /dev/null
+++ b/src/scrnaseq/save_dataset.py
@@ -0,0 +1,131 @@
+import json
+import os
+import shutil
+from functools import singledispatch
+from typing import Any
+
+import dolomite_base as dl
+from gypsum_client import fetch_metadata_schema, validate_metadata
+from singlecellexperiment import SingleCellExperiment
+
+from .utils import format_object_metadata
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+@singledispatch
+def save_dataset(x: Any, path, metadata):
+    """Save a dataset to disk.
+
+    Args:
+        x:
+            An object containing single-cell data.
+            May be a derivative of SummarizedExperiment or AnnData.
+
+        path:
+            Path to a new directory in which to save the dataset.
+
+        metadata:
+            Dictionary containing the metadata for this dataset.
+            See the schema returned by
+            :py:func:`~gypsum_client.fetch_metadata_schema`.
+
+            Note that the ``applications.takane`` property will be automatically
+            added by this function and does not have to be supplied.
+
+    See Also:
+        `metadata index `_,
+        for the expected schema of the metadata.
+
+        :py:func:`~scrnaseq.polish_dataset.polish_dataset`,
+        to polish ``x`` before saving it.
+
+        :py:func:`~gypsum_client.upload_directory`, to upload the saved contents.
+
+    Example:
+
+    .. code-block:: python
+
+        # Fetch an existing dataset
+        # or create your own ``SingleCellExperiment``
+        # or ``AnnData`` object.
+        sce = scrnaseq.fetch_dataset("zeisel-brain-2015", "2023-12-14")
+
+        # Provide dataset-level metadata for search and findability
+        meta = {
+            "title": "My dataset made from zeisel brain",
+            "description": "This is a copy of the zeisel brain dataset",
+            "taxonomy_id": ["10090"],  # NCBI ID
+            "genome": ["GRCh38"],  # genome build
+            "sources": [{"provider": "GEO", "id": "GSE12345"}],
+            "maintainer_name": "Shizuka Mogami",
+            "maintainer_email": "mogami.shizuka@765pro.com",
+        }
+
+        import shutil
+        import tempfile
+
+        cache_dir = tempfile.mkdtemp()
+
+        # Make sure the directory is clean
+        shutil.rmtree(cache_dir)
+
+        # Save the dataset
+        scrnaseq.save_dataset(sce, cache_dir, meta)
+    """
+    raise NotImplementedError(
+        f"'save_dataset' is not supported for objects of class: {type(x)}"
+    )
+
+
+@save_dataset.register
+def save_dataset_sce(x: SingleCellExperiment, path: str, metadata: dict):
+    """Save a :py:class:`~singlecellexperiment.SingleCellExperiment.SingleCellExperiment` to disk."""
+    schema = fetch_metadata_schema()
+
+    if "bioconductor_version" not in metadata:
+        metadata["bioconductor_version"] = "3.14"  # placeholder version
+
+    validate_metadata(metadata, schema)
+
+    if os.path.exists(path):
+        shutil.rmtree(path)
+
+    dl.save_object(x, path)
+
+    takane = format_object_metadata(x)
+    takane["type"] = dl.read_object_file(path)["type"]
+
+    if "applications" not in metadata:
+        metadata["applications"] = {}
+
+    metadata["applications"]["takane"] = takane
+
+    # Second validation with the takane metadata.
+    contents = json.dumps(metadata, indent=4)
+    validate_metadata(json.loads(contents), schema=schema)
+
+    with open(os.path.join(path, "_bioconductor.json"), "w") as f:
+        f.write(contents)
+
+    return
+
+
+# anndata is an optional dependency; register its handler only if available.
+has_anndata = False
+try:
+    import anndata
+
+    has_anndata = True
+except Exception:
+    pass
+
+if has_anndata:
+
+    @save_dataset.register
+    def save_dataset_anndata(x: anndata.AnnData, path: str, metadata: dict):
+        """Save an :py:class:`~anndata.AnnData` to disk by converting it to a ``SingleCellExperiment``."""
+        _sce = SingleCellExperiment.from_anndata(x)
+        return save_dataset(_sce, path, metadata)
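The conditional `@save_dataset.register` above is the standard trick for keeping `anndata` optional at runtime. A generic, self-contained sketch of the pattern; the names here are illustrative and not part of scrnaseq:

from functools import singledispatch
from typing import Any


@singledispatch
def describe(x: Any) -> str:
    raise NotImplementedError(f"unsupported type: {type(x)}")


try:
    import anndata

    @describe.register
    def _(x: anndata.AnnData) -> str:
        # Registered only when the optional dependency imports cleanly.
        return f"AnnData with shape {x.shape}"
except ImportError:
    pass  # without anndata, 'describe' simply lacks that handler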
+ """ + obj = read_object( + path, + metadata=metadata, + scrnaseq_realize_assays=scrnaseq_realize_assays, + scrnaseq_realize_reduced_dims=scrnaseq_realize_reduced_dims, + **kwargs, + ) + + if isinstance(obj, SummarizedExperiment): + if scrnaseq_realize_assays: + _assays = {} + for y in obj.get_assay_names(): + _assays[y] = realize_array(obj.assay(y)) + + obj = obj.set_assays(_assays) + + if isinstance(obj, SingleCellExperiment): + if scrnaseq_realize_reduced_dims: + _red_dims = {} + for z in obj.get_reduced_dim_names(): + _red_dims[z] = realize_array(obj.reduced_dim(z)) + + obj = obj.set_reduced_dims(_red_dims) + + return obj + + +def realize_array(x): + """Realize a `ReloadedArray` into a dense array or sparse matrix. + + Args: + x: + `ReloadedArray` object. + + Returns: + + Realized array or matrix. + """ + from dolomite_matrix import ReloadedArray + + if isinstance(x, ReloadedArray): + if is_sparse(x): + x = to_scipy_sparse_matrix(x, "csc") + else: + x = to_dense_array(x) + + return x + + +def format_object_metadata(x) -> dict: + """Format object related metadata. + + Create object-related metadata to validate against the default + schema from :py:func:`~gypsum.fetch_metadata_schema`. + This is intended for downstream package developers who are + auto-generating metadata documents to be validated by + :py:func:`~gypsum.validate_metadata`. + + Args: + x: + An Python object, typically an instance of a BiocPy class. + + Returns: + Dictionary containing metadata for the object. + """ + _meta = {} + + if isinstance(x, SummarizedExperiment): + _meta["summarized_experiment"] = { + "rows": x.shape[0], + "columns": x.shape[1], + "assays": list(x.get_assay_names()), + "column_annotations": list(x.get_column_names()), + } + + if isinstance(x, SingleCellExperiment): + _meta["single_cell_experiment"] = { + "reduced_dimensions": list(x.get_reduced_dim_names()), + "alternative_experiments": list(x.get_alternative_experiment_names()), + } + elif isinstance(x, BiocFrame): + _meta["data_frame"] = { + "rows": len(x), + "column_names": list(x.get_column_names()), + } + + return _meta diff --git a/tests/test_fetch_dataset.py b/tests/test_fetch_dataset.py index 753a815..f9ce31b 100644 --- a/tests/test_fetch_dataset.py +++ b/tests/test_fetch_dataset.py @@ -1,5 +1,9 @@ +import numpy as np import pytest -from scrnaseq import fetch_dataset +import scipy.sparse as sp +from delayedarray import is_sparse +from dolomite_matrix import ReloadedArray +from scrnaseq import fetch_dataset, fetch_metadata from singlecellexperiment import SingleCellExperiment __author__ = "Jayaram Kancherla" @@ -9,6 +13,39 @@ def test_fetch_dataset(): sce = fetch_dataset("zeisel-brain-2015", "2023-12-14") - - print(sce) assert isinstance(sce, SingleCellExperiment) + + # Correctly creates ReloadedMatrix objects. + ass = sce.get_assays() + assert all(isinstance(a, ReloadedArray) for _, a in ass.items()) + assert all(is_sparse(a) for _, a in ass.items()) + + ass_0 = ass["counts"] + assert "zeisel-brain-2015" in ass_0.seed.path + assert "2023-12-14" in ass_0.seed.path + + # Works with realization options. 
diff --git a/tests/test_fetch_dataset.py b/tests/test_fetch_dataset.py
index 753a815..f9ce31b 100644
--- a/tests/test_fetch_dataset.py
+++ b/tests/test_fetch_dataset.py
@@ -1,5 +1,9 @@
+import numpy as np
 import pytest
-from scrnaseq import fetch_dataset
+import scipy.sparse as sp
+from delayedarray import is_sparse
+from dolomite_matrix import ReloadedArray
+from scrnaseq import fetch_dataset, fetch_metadata
 from singlecellexperiment import SingleCellExperiment
 
 __author__ = "Jayaram Kancherla"
@@ -9,6 +13,39 @@
 
 def test_fetch_dataset():
     sce = fetch_dataset("zeisel-brain-2015", "2023-12-14")
-
-    print(sce)
     assert isinstance(sce, SingleCellExperiment)
+
+    # Correctly creates ReloadedArray objects.
+    ass = sce.get_assays()
+    assert all(isinstance(a, ReloadedArray) for _, a in ass.items())
+    assert all(is_sparse(a) for _, a in ass.items())
+
+    ass_0 = ass["counts"]
+    assert "zeisel-brain-2015" in ass_0.seed.path
+    assert "2023-12-14" in ass_0.seed.path
+
+    # Works with realization options.
+    sce = fetch_dataset("zeisel-brain-2015", "2023-12-14", realize_assays=True)
+    ass = sce.get_assays()
+    assert all(isinstance(a, (sp.csc_matrix, sp.csr_matrix)) for _, a in ass.items())
+
+    alt_exps = sce.get_alternative_experiments()
+    for _, alt in alt_exps.items():
+        alt_exp_ass = alt.get_assays()
+        assert all(isinstance(a, np.ndarray) for _, a in alt_exp_ass.items())
+
+
+def test_fetch_dataset_realizes_reduced_dimensions():
+    sce = fetch_dataset("aztekin-tail-2019", "2023-12-14", realize_reduced_dims=False)
+    red_dim = sce.get_reduced_dims()
+    assert all(isinstance(a, ReloadedArray) for _, a in red_dim.items())
+
+    sce = fetch_dataset("aztekin-tail-2019", "2023-12-14", realize_reduced_dims=True)
+    red_dim = sce.get_reduced_dims()
+    assert all(isinstance(a, np.ndarray) for _, a in red_dim.items())
+
+
+def test_fetch_metadata():
+    meta = fetch_metadata("zeisel-brain-2015", "2023-12-14")
+    assert "Brain structure" in meta["title"]
+    assert meta["taxonomy_id"][0] == "10090"
diff --git a/tests/test_list_dataset.py b/tests/test_list_dataset.py
new file mode 100644
index 0000000..380ad46
--- /dev/null
+++ b/tests/test_list_dataset.py
@@ -0,0 +1,15 @@
+import tempfile
+
+import pandas as pd
+from scrnaseq import list_datasets
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def test_list_dataset():
+    datasets = list_datasets(cache_dir=tempfile.mkdtemp())
+
+    assert isinstance(datasets, pd.DataFrame)
+    assert len(datasets) > 80
diff --git a/tests/test_list_version.py b/tests/test_list_version.py
new file mode 100644
index 0000000..31ecf96
--- /dev/null
+++ b/tests/test_list_version.py
@@ -0,0 +1,19 @@
+from scrnaseq import fetch_latest_version, list_versions
+
+__author__ = "Jayaram Kancherla"
+__copyright__ = "Jayaram Kancherla"
+__license__ = "MIT"
+
+
+def test_list_versions():
+    versions = list_versions("romanov-brain-2017")
+
+    assert isinstance(versions, list)
+    assert "2023-12-19" in versions
+
+
+def test_latest_version():
+    version = fetch_latest_version("romanov-brain-2017")
+
+    assert isinstance(version, str)
+    assert version == "2023-12-19"
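Since `anndata` is only declared in the packaging extras, the AnnData round-trip tests below could guard the import so the rest of the suite still runs without it. A hedged suggestion, not part of the diff, using pytest's stock helper:

import pytest

# Skips the whole module when anndata is absent instead of failing at import.
ad = pytest.importorskip("anndata")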
"_bioconductor.json")) as f: + saved_meta = json.load(f) + + assert saved_meta["bioconductor_version"] == "3.14" # Placeholder version + + # Test validation failure + meta["title"] = 1234 + with pytest.raises(Exception): + save_dataset(sce, tmp, meta) + + shutil.rmtree(tmp) + + +def test_save_dataset_anndata(): + data = np.random.poisson(1, (10, 100)) + adata = ad.AnnData(data) + adata.obs["foo"] = np.random.choice(list("abcdefghijklmnopqrstuvwxyz"), 10) + adata.var_names = [f"GENE_{i+1}" for i in range(adata.n_vars)] + adata.obs_names = list("ABCDEFGHIJ") + adata.layers["counts"] = data + + meta = { + "title": "My dataset forked from ziesel brain", + "description": "This is a copy of the ziesel", + "taxonomy_id": ["10090"], # NCBI ID + "genome": ["GRCh38"], # genome build + "sources": [{"provider": "GEO", "id": "GSE12345"}], + "maintainer_name": "Shizuka Mogami", + "maintainer_email": "mogami.shizuka@765pro.com", + } + + tmp = tempfile.mkdtemp() + save_dataset(adata, tmp, meta) + + # Load the saved AnnData object + roundtrip = dl.read_object(tmp) + + assert len(roundtrip.get_column_data()) == 10 + assert isinstance(roundtrip.get_assays()["counts"], ReloadedArray) + assert isinstance(adata.layers["counts"], np.ndarray) + assert np.array_equal( + to_dense_array(roundtrip.get_assays()["counts"]).transpose(), adata.layers["counts"] + ) + + # Load and check the metadata + with open(os.path.join(tmp, "_bioconductor.json")) as f: + saved_meta = json.load(f) + + assert saved_meta["bioconductor_version"] == "3.14" # Placeholder version + + # Test validation failure + meta["title"] = 1234 + with pytest.raises(Exception): + save_dataset(adata, tmp, meta) + + shutil.rmtree(tmp)