hubmapconsortium · ilan-gold · Oct 10, 2021 · Oct 10, 2021
diff --git a/containers/snap-to-anndata-zarr/Dockerfile b/containers/snap-to-anndata-zarr/Dockerfile
@@ -0,0 +1,23 @@
+# This Dockerfile sits next to, not inside, the build context (the files it
+# COPYs live in the sibling `context/` directory), so point Docker at it
+# explicitly with --file, e.g. `docker build --file Dockerfile context`.
+
+# Using Conda because pyarrow did not install easily on python base images.
+FROM continuumio/miniconda3:4.7.12
+
+# For tiff packages: compilers and Python headers so that pip can build
+# native extensions. The apt package lists are removed in the same layer
+# so they do not add to the image size.
+RUN apt-get --allow-releaseinfo-change update && \
+      apt-get install -y gcc python3-dev g++ && \
+      rm -rf /var/lib/apt/lists/*
+
+# Install the fully pinned dependency set first so this (slow) layer is cached.
+# --no-cache-dir keeps pip's download cache out of the image layer.
+COPY requirements-freeze.txt .
+RUN pip install --no-cache-dir -r ./requirements-freeze.txt
+
+# In development, you may want to pin a single dependency in requirements.txt,
+# without throwing away the entire cache layer from requirements-freeze.txt.
+# (But once it works, you should check in an updated freeze!)
+
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r ./requirements.txt
+
+# Copy the rest of the build context (including main.py) last, so code-only
+# changes do not invalidate the dependency layers above.
+COPY . .
+
+# Default invocation: read from /input and write to /output.
+CMD [ "python", "main.py", \
+      "--input_dir", "/input", \
+      "--output_dir", "/output" ]
diff --git a/containers/snap-to-anndata-zarr/README.md b/containers/snap-to-anndata-zarr/README.md
@@ -0,0 +1,3 @@
+# snap-to-anndata-zarr
+
+This container converts scATAC-seq data produced by the [SnapATAC](https://github.com/r3fang/SnapATAC) package into custom genomic profiles saved in `zarr` format. Saving an accompanying [AnnData store](https://anndata.readthedocs.io/en/latest/anndata.read_h5ad.html) is not enabled yet: the `create_anndata()` call in `context/main.py` is commented out.
diff --git a/containers/snap-to-anndata-zarr/VERSION b/containers/snap-to-anndata-zarr/VERSION
@@ -0,0 +1 @@
+0.0.1
diff --git a/containers/snap-to-anndata-zarr/context/main.py b/containers/snap-to-anndata-zarr/context/main.py
@@ -0,0 +1,44 @@
+import argparse
+from pathlib import Path
+from os import path
+
+import zarr
+from scipy.io import mmread
+import pandas as pd
+from vitessce import SnapWrapper
+
+# NOTE(review): none of the four constants below is referenced anywhere in
+# this script; they look like leftovers from another container's main.py --
+# confirm before removing.
+NUM_MARKER_GENES_TO_VISUALIZE = 5
+VAR_CHUNK_SIZE = 10
+SECONDARY_ANALYSIS = "secondary_analysis.h5ad"
+SCVELO_ANNOTATED = "scvelo_annotated.h5ad"
+
+def main(input_dir, output_dir):
+    output_dir.mkdir(exist_ok=True)
+    mtx = mmread(path.join(input_dir, 'filtered_cell_by_bin.mtx'))
+    barcodes_df = pd.read_csv(path.join(input_dir, 'barcodes.txt'), header=None)
+    bins_df = pd.read_csv(path.join(input_dir, 'bins.txt'), header=None)
+    clusters_df = pd.read_csv(path.join(input_dir, 'umap_coords_clusters.csv'), index_col=0)
+    zarr_filepath = path.join(output_dir, 'hubmap-ui.snap.multires.zarr')
+    w = SnapWrapper(mtx, barcodes_df, bins_df, clusters_df)
+    # In theory, it would be nice to create an AnnData store instead of json
+    # We could then attach things to it, like clusters in `uns`
+    # w.create_anndata()
+    w.create_genomic_multivec_zarr(zarr_filepath)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description=f"Transform Snap into zarr.")
+    parser.add_argument(
+        "--input_dir",
+        required=True,
+        type=Path,
+        help="directory containing HuBMAP SnapATAC data",
+    )
+    parser.add_argument(
+        "--output_dir",
+        required=True,
+        type=Path,
+        help="directory where (AnnData) zarr files should be written",
+    )
+    args = parser.parse_args()
+    main(args.input_dir, args.output_dir)
diff --git a/containers/snap-to-anndata-zarr/context/requirements-freeze.txt b/containers/snap-to-anndata-zarr/context/requirements-freeze.txt
@@ -0,0 +1,95 @@
+aiofiles==0.7.0
+anndata==0.7.1
+argcomplete==1.12.3
+argon2-cffi==21.1.0
+asciitree==0.3.3
+asn1crypto==1.0.1
+attrs==21.2.0
+backcall==0.2.0
+bleach==4.1.0
+certifi==2019.9.11
+cffi==1.12.3
+chardet==3.0.4
+conda==4.7.12
+conda-package-handling==1.6.0
+cryptography==2.7
+debugpy==1.5.0
+decorator==5.1.0
+defusedxml==0.7.1
+entrypoints==0.3
+fasteners==0.16
+generate-tiff-offsets==0.1.7
+h11==0.12.0
+h2==4.1.0
+h5py==2.10.0
+hpack==4.0.0
+Hypercorn==0.11.2
+hyperframe==6.0.1
+idna==2.8
+importlib-metadata==3.7.0
+ipykernel==6.4.1
+ipython==7.28.0
+ipython-genutils==0.2.0
+ipywidgets==7.6.5
+jedi==0.18.0
+Jinja2==3.0.2
+jsonschema==4.0.1
+jupyter-client==7.0.6
+jupyter-core==4.8.1
+jupyterlab-pygments==0.1.2
+jupyterlab-widgets==1.0.2
+MarkupSafe==2.0.1
+matplotlib-inline==0.1.3
+mistune==0.8.4
+natsort==7.1.1
+nbclient==0.5.4
+nbconvert==6.2.0
+nbformat==5.1.3
+negspy==0.2.24
+nest-asyncio==1.5.1
+notebook==6.4.4
+numcodecs==0.7.3
+numpy==1.20.1
+packaging==20.9
+pandas==1.2.3
+pandocfilters==1.5.0
+parso==0.8.2
+pexpect==4.8.0
+pickleshare==0.7.5
+priority==2.0.0
+prometheus-client==0.11.0
+prompt-toolkit==3.0.20
+ptyprocess==0.7.0
+pycosat==0.6.3
+pycparser==2.19
+Pygments==2.10.0
+pyOpenSSL==19.0.0
+pyparsing==2.4.7
+pyrsistent==0.18.0
+PySocks==1.7.1
+python-dateutil==2.8.1
+pytz==2021.1
+pyzmq==22.3.0
+requests==2.22.0
+ruamel-yaml==0.15.46
+scipy==1.6.1
+Send2Trash==1.8.0
+six==1.12.0
+starlette==0.14.0
+terminado==0.12.1
+testpath==0.5.0
+tifffile==2020.10.1
+toml==0.10.2
+tornado==6.1
+tqdm==4.36.1
+traitlets==5.1.0
+typing-extensions==3.7.4.3
+ujson==4.2.0
+urllib3==1.24.2
+vitessce==1.0.4
+wcwidth==0.2.5
+webencodings==0.5.1
+widgetsnbextension==3.5.1
+wsproto==1.0.0
+zarr==2.6.1
+zipp==3.4.1
diff --git a/containers/snap-to-anndata-zarr/context/requirements.txt b/containers/snap-to-anndata-zarr/context/requirements.txt
@@ -0,0 +1,6 @@
+anndata==0.7.1
+zarr==2.6.1
+vitessce==1.0.4
+pandas==1.2.3
+scipy==1.6.1
+
diff --git a/containers/snap-to-anndata-zarr/create-test-input.py b/containers/snap-to-anndata-zarr/create-test-input.py
@@ -0,0 +1,103 @@
+from scipy.sparse import coo_matrix
+from scipy.io import mmwrite
+import pandas as pd
+import numpy as np
+
+def create_test_snaptools_files(mtx_path, bins_path, barcodes_path, clusters_path):
+    bins_arr = [
+        '1:10001-15000',
+        '1:15001-20000',
+        '1:65001-70000',
+        '1:80001-85000',
+        '1:105001-110000',
+        '1:115001-120000',
+        '1:270001-275000',
+        '2:10001-15000',
+        '2:15001-20000',
+        '2:20001-25000',
+        '2:25001-30000',
+        '2:30001-35000',
+        '2:35001-40000',
+        '2:55001-60000',
+        '3:15001-20000',
+        '18:10001-15000',
+    ]
+    bins_df = pd.DataFrame(
+        data=[
+            {'bin': bin_str}
+            for bin_str in bins_arr
+        ]
+    )
+
+    barcodes_arr = [
+        'AAACATCGAACGCTTAACGTATCA',
+        'AAACATCGACACGACCACACAGAA',
+        'AAACATCGAGTACAAGACAGCAGA',
+        'AAACATCGATCCTGTAAACCGAGA',
+        'AAACATCGCACCTTACACACAGAA',
+        'AAACATCGCACTTCGAAACCGAGA',
+    ]
+    barcodes_df = pd.DataFrame(
+        data=[
+            {'barcode': barcode_str}
+            for barcode_str in barcodes_arr
+        ]
+    )
+
+    clusters_df = pd.DataFrame(
+        data=[
+            {
+                'barcode': 'AAACATCGAACGCTTAACGTATCA',
+                'umap.1': 0.45,
+                'umap.2': 1.69,
+                'cluster': '4',
+            },
+            {
+                'barcode': 'AAACATCGACACGACCACACAGAA',
+                'umap.1': -1.27,
+                'umap.2': -1.16,
+                'cluster': '4',
+            },
+            {
+                'barcode': 'AAACATCGAGTACAAGACAGCAGA',
+                'umap.1': 4.43,
+                'umap.2': 1.64,
+                'cluster': '10',
+            },
+            {
+                'barcode': 'AAACATCGATCCTGTAAACCGAGA',
+                'umap.1': -0.84,
+                'umap.2': 1.57,
+                'cluster': '3',
+            },
+            {
+                'barcode': 'AAACATCGCACCTTACACACAGAA',
+                'umap.1': 0.54,
+                'umap.2': 0.11,
+                'cluster': '2',
+            },
+            {
+                'barcode': 'AAACATCGCACTTCGAAACCGAGA',
+                'umap.1': 1.24,
+                'umap.2': 1.43,
+                'cluster': '3',
+            },
+        ]
+    )
+    clusters_df = clusters_df.set_index('barcode')
+
+    bins_df.to_csv(bins_path, sep='\t', index=False, header=False)
+    barcodes_df.to_csv(barcodes_path, sep='\t', index=False, header=False)
+    clusters_df.to_csv(clusters_path, index=True)
+
+    mtx = np.array([
+        [0, 2, 1, 3, 0, 4, 9, 0, 0, 1, 0, 0, 0, 1, 0, 3],
+        [1, 1, 3, 1, 0, 0, 0, 0, 0, 2, 2, 3, 4, 2, 4, 3],
+        [0, 1, 1, 1, 1, 1, 3, 2, 1, 0, 0, 0, 1, 3, 5, 0],
+        [0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 2, 1, 3, 1, 0, 0],
+        [2, 3, 2, 4, 1, 2, 3, 1, 0, 0, 0, 0, 1, 0, 0, 0],
+        [4, 0, 4, 0, 1, 0, 0, 0, 8, 1, 4, 3, 2, 1, 0, 2],
+    ])
+    coo_mtx = coo_matrix(mtx)
+
+    mmwrite(mtx_path, coo_mtx)
diff --git a/containers/snap-to-anndata-zarr/test-input/barcodes.txt b/containers/snap-to-anndata-zarr/test-input/barcodes.txt
@@ -0,0 +1,6 @@
+AAACATCGAACGCTTAACGTATCA
+AAACATCGACACGACCACACAGAA
+AAACATCGAGTACAAGACAGCAGA
+AAACATCGATCCTGTAAACCGAGA
+AAACATCGCACCTTACACACAGAA
+AAACATCGCACTTCGAAACCGAGA
diff --git a/containers/snap-to-anndata-zarr/test-input/bins.txt b/containers/snap-to-anndata-zarr/test-input/bins.txt
@@ -0,0 +1,16 @@
+1:10001-15000
+1:15001-20000
+1:65001-70000
+1:80001-85000
+1:105001-110000
+1:115001-120000
+1:270001-275000
+2:10001-15000
+2:15001-20000
+2:20001-25000
+2:25001-30000
+2:30001-35000
+2:35001-40000
+2:55001-60000
+3:15001-20000
+18:10001-15000
diff --git a/containers/snap-to-anndata-zarr/test-input/filtered_cell_by_bin.mtx b/containers/snap-to-anndata-zarr/test-input/filtered_cell_by_bin.mtx
@@ -0,0 +1,61 @@
+%%MatrixMarket matrix coordinate integer general
+%
+6 16 58
+1 2 2
+1 3 1
+1 4 3
+1 6 4
+1 7 9
+1 10 1
+1 14 1
+1 16 3
+2 1 1
+2 2 1
+2 3 3
+2 4 1
+2 10 2
+2 11 2
+2 12 3
+2 13 4
+2 14 2
+2 15 4
+2 16 3
+3 2 1
+3 3 1
+3 4 1
+3 5 1
+3 6 1
+3 7 3
+3 8 2
+3 9 1
+3 13 1
+3 14 3
+3 15 5
+4 3 1
+4 4 1
+4 6 1
+4 7 1
+4 8 1
+4 11 2
+4 12 1
+4 13 3
+4 14 1
+5 1 2
+5 2 3
+5 3 2
+5 4 4
+5 5 1
+5 6 2
+5 7 3
+5 8 1
+5 13 1
+6 1 4
+6 3 4
+6 5 1
+6 9 8
+6 10 1
+6 11 4
+6 12 3
+6 13 2
+6 14 1
+6 16 2
diff --git a/containers/snap-to-anndata-zarr/test-input/umap_coords_clusters.csv b/containers/snap-to-anndata-zarr/test-input/umap_coords_clusters.csv
@@ -0,0 +1,7 @@
+barcode,umap.1,umap.2,cluster
+AAACATCGAACGCTTAACGTATCA,0.45,1.69,4
+AAACATCGACACGACCACACAGAA,-1.27,-1.16,4
+AAACATCGAGTACAAGACAGCAGA,4.43,1.64,10
+AAACATCGATCCTGTAAACCGAGA,-0.84,1.57,3
+AAACATCGCACCTTACACACAGAA,0.54,0.11,2
+AAACATCGCACTTCGAAACCGAGA,1.24,1.43,3
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# snap-to-anndata-zarr

		This container converts scATAC-seq data produced by the [SnapATAC](https://github.com/r3fang/SnapATAC) package into custom genomic profiles saved in `zarr` format. Saving an accompanying [AnnData store](https://anndata.readthedocs.io/en/latest/anndata.read_h5ad.html) is not enabled yet: the `create_anndata()` call in `context/main.py` is commented out.