Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scATAC-seq UMAP DR + clustering -> Arrow container #40

Merged
merged 4 commits into from
Jun 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions containers/scatac-csv-to-arrow/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# NOTE(review): the comment below appears copied from a sibling container;
# presumably "it" refers to this Dockerfile being passed via `docker build --file`
# rather than living at the context root — confirm against the build scripts.
# We just need to use --file to point at it, instead of assuming it is in context.

# Using Conda because pyarrow did not install easily on python base images.
FROM continuumio/miniconda3:4.7.12

# gcc + python3-dev are needed to compile Python packages with C extensions.
# (The original "For tiff packages" comment was likely copied from an imaging
# container; nothing in this container's requirements is TIFF-related.)
RUN apt-get update &&\
apt-get install -y gcc python3-dev
COPY requirements-freeze.txt .
RUN pip install -r ./requirements-freeze.txt

# In development, you may want to pin a single dependency in requirements.txt,
# without throwing away the entire cache layer from requirements-freeze.txt.
# (But once it works, you should check in an updated freeze!)

COPY requirements.txt .
RUN pip install -r ./requirements.txt

COPY . .

# Matches the CWL baseCommand: read CSVs from /input, write outputs to /output.
CMD [ "python", "main.py", \
"--input_dir", "/input", \
"--output_dir", "/output" ]
5 changes: 5 additions & 0 deletions containers/scatac-csv-to-arrow/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# scatac-csv-to-arrow

Translate CSV output from the HuBMAP scATAC-seq pipeline into
[Apache Arrow](https://arrow.apache.org/) format,
as well as into a normalized CSV and Vitessce-compatible JSON.
1 change: 1 addition & 0 deletions containers/scatac-csv-to-arrow/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.0.1
110 changes: 110 additions & 0 deletions containers/scatac-csv-to-arrow/context/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import argparse
from pathlib import Path
import json

import pandas as pd
import pyarrow as pa


def csv_to_arrow(csv_file: Path, arrow_file: Path):
    """Convert a pipeline CSV to an Arrow IPC (random-access) file.

    The CSV's first column is used as the index; the remaining three
    columns are renamed to the standardized 'umap_x', 'umap_y', 'leiden'
    (raw pipeline headers are discarded).

    :param csv_file: Path of the input CSV (index + exactly 3 data columns).
    :param arrow_file: Path where the Arrow file will be written.
    """
    df = pd.read_csv(csv_file, index_col=0)
    df.index.name = 'index'
    # Standardize column names regardless of what the raw pipeline emitted.
    df.columns = ['umap_x', 'umap_y', 'leiden']

    table = pa.Table.from_pandas(df)

    # Close the writer even if write() raises, so we don't leak the file
    # handle or leave an Arrow file with a missing footer on disk.
    writer = pa.RecordBatchFileWriter(arrow_file, table.schema)
    try:
        writer.write(table)
    finally:
        writer.close()


def arrow_to_csv(arrow_file, csv_file):
    """Round-trip an Arrow IPC file back out as a normalized CSV."""
    reader = pa.ipc.open_file(arrow_file)
    frame = reader.read_pandas()
    frame.to_csv(csv_file)


# Big TODO: deduplicate this with h5ad-to-arrow
def arrow_to_json(arrow_file, **kwargs):
    """Derive the three Vitessce JSON files from the Arrow file.

    Required keyword arguments:
        umap_json: output path for per-cell UMAP coords + cluster factor.
        leiden_json: output path for the cluster "factors" mapping.
        cell_sets_json: output path for the hierarchical cell-sets tree.
    """
    umap_json = kwargs['umap_json']
    leiden_json = kwargs['leiden_json']
    cell_sets_json = kwargs['cell_sets_json']
    df = pa.ipc.open_file(arrow_file).read_pandas()
    # df.T.to_dict() would coerce every row to a single common dtype
    # (floats, when the UMAP coordinates are floats), corrupting the
    # integer cluster IDs below; orient='index' keeps per-column dtypes.
    df_items = df.to_dict(orient='index').items()

    id_to_umap = {
        k: {
            "mappings": {"UMAP": [v['umap_x'], v['umap_y']]},
            "factors": {"Leiden Cluster": str(int(v['leiden']))}
        }
        for (k, v) in df_items
    }
    # Poor-man's pretty-printing: one cell per line for easier diffing.
    pretty_json_umap = json.dumps(id_to_umap).replace('}},', '}},\n')
    with open(umap_json, 'w') as f:
        f.write(pretty_json_umap)

    # Plain Python ints: astype('uint8') would silently wrap cluster IDs
    # above 255, and numpy scalars are not JSON-serializable.
    leiden_clusters = sorted(int(cluster) for cluster in df['leiden'].unique())
    id_to_factors = {
        'Leiden Cluster': {
            'map': [str(cluster) for cluster in leiden_clusters],
            # int(...) guards against float coercion upstream and keeps
            # the JSON values integral, matching the factors fixtures.
            'cells': {k: int(v['leiden']) for (k, v) in df_items}
        }
    }
    pretty_json_factors = json.dumps(id_to_factors).replace('}},', '}},\n')
    with open(leiden_json, 'w') as f:
        f.write(pretty_json_factors)

    # Construct the tree, according to the following schema:
    # https://github.com/hubmapconsortium/vitessce/blob/d5f63aa1d08aa61f6b20f6ad6bbfba5fceb6b5ef/src/schemas/cell_sets.schema.json
    cell_sets = {
        "datatype": "cell",
        "version": "0.1.2",
        "tree": [
            {
                "name": "Leiden Cluster",
                "children": [
                    {
                        "name": f"Cluster {cluster}",
                        "set": df.loc[df['leiden'] == cluster].index.values.tolist(),
                    }
                    for cluster in leiden_clusters
                ]
            }
        ]
    }
    with open(cell_sets_json, 'w') as f:
        json.dump(cell_sets, f, indent=1)


def main(input_dir: Path, output_dir: Path):
    """Convert every umap_coords_clusters.csv under input_dir.

    For each input CSV, writes four sibling outputs into output_dir:
    .arrow, .csv (normalized), .cells.json, .factors.json, .cell-sets.json.
    """
    output_dir.mkdir(exist_ok=True, parents=True)
    for csv_path in input_dir.glob('**/umap_coords_clusters.csv'):
        arrow_path = output_dir / csv_path.with_suffix('.arrow').name
        csv_to_arrow(csv_path, arrow_path)
        arrow_to_csv(arrow_path, arrow_path.with_suffix('.csv'))
        arrow_to_json(
            arrow_file=arrow_path,
            umap_json=arrow_path.with_suffix('.cells.json'),
            leiden_json=arrow_path.with_suffix('.factors.json'),
            cell_sets_json=arrow_path.with_suffix('.cell-sets.json'),
        )


if __name__ == '__main__':
    # CLI entry point; argument names mirror the Docker CMD in the container.
    parser = argparse.ArgumentParser(
        description='Transform raw pipeline CSV into Arrow, JSON, and standardized CSV.',
    )
    parser.add_argument(
        '--input_dir',
        type=Path,
        required=True,
        help='directory containing csv files to read',
    )
    parser.add_argument(
        '--output_dir',
        type=Path,
        required=True,
        help='directory where arrow files should be written',
    )
    cli_args = parser.parse_args()
    main(cli_args.input_dir, cli_args.output_dir)
30 changes: 30 additions & 0 deletions containers/scatac-csv-to-arrow/context/requirements-freeze.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
anndata==0.7.1
asn1crypto==1.0.1
certifi==2019.11.28
cffi==1.12.3
chardet==3.0.4
conda==4.7.12
conda-package-handling==1.6.0
cryptography==2.7
h5py==2.10.0
idna==2.8
importlib-metadata==1.6.0
natsort==6.2.0
numpy==1.17.4
packaging==20.3
pandas==0.25.3
pyarrow==0.15.1
pycosat==0.6.3
pycparser==2.19
pyOpenSSL==19.0.0
pyparsing==2.4.7
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2019.3
requests==2.22.0
ruamel-yaml==0.15.46
scipy==1.3.3
six==1.13.0
tqdm==4.36.1
urllib3==1.24.2
zipp==3.1.0
3 changes: 3 additions & 0 deletions containers/scatac-csv-to-arrow/context/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
anndata==0.7.1
pyarrow==0.15.1
h5py==2.10.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"","umap.1","umap.2","cluster"
"CAT",-1,-1,0
"TAG",0,0,1
"ATG",1,1,2
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"datatype": "cell",
"version": "0.1.2",
"tree": [
{
"name": "Leiden Cluster",
"children": [
{
"name": "Cluster 0",
"set": [
"CAT"
]
},
{
"name": "Cluster 1",
"set": [
"TAG"
]
},
{
"name": "Cluster 2",
"set": [
"ATG"
]
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"CAT": {"mappings": {"UMAP": [-1, -1]}, "factors": {"Leiden Cluster": "0"}},
"TAG": {"mappings": {"UMAP": [0, 0]}, "factors": {"Leiden Cluster": "1"}},
"ATG": {"mappings": {"UMAP": [1, 1]}, "factors": {"Leiden Cluster": "2"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
index,umap_x,umap_y,leiden
CAT,-1,-1,0
TAG,0,0,1
ATG,1,1,2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"Leiden Cluster": {"map": ["0", "1", "2"], "cells": {"CAT": 0, "TAG": 1, "ATG": 2}}}
18 changes: 18 additions & 0 deletions scatac-csv-to-arrow-manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[
{
"pattern": "output/(.*)\\.arrow",
"description": "Input data relevant for visualization saved in columnar Apache Arrow format."
},
{
"pattern": "output/(.*)\\.csv",
"description": "Input data relevant for visualization saved in columnar comma-separated-file format."
},
{
"pattern": "output/(.*)\\.cells\\.json",
"description": "JSON-formatted information about this scATAC-seq run including scatterplot coordinates and clustering."
},
{
"pattern": "output/(.*)\\.factors\\.json",
"description": "JSON-formatted information about this scATAC-seq's clustering."
}
]
19 changes: 19 additions & 0 deletions scatac-csv-to-arrow.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env cwl-runner

cwlVersion: v1.0
class: CommandLineTool
# TODO: Make main.py executable?
# Fixed arguments come first; the input directory is appended via inputBinding.
baseCommand: ['python', '/main.py', '--output_dir', './output', '--input_dir']
hints:
  DockerRequirement:
    # NOTE(review): keep this tag in sync with the container's VERSION file.
    dockerPull: hubmap/portal-container-scatac-csv-to-arrow:0.0.1
inputs:
  input_directory:
    type: Directory
    inputBinding:
      # Appended after the baseCommand tokens, as the value of --input_dir.
      position: 6
outputs:
  output_directory:
    type: Directory
    outputBinding:
      # Collect the ./output directory that main.py creates.
      glob: output
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"","umap.1","umap.2","cluster"
"CAT",-1,-1,0
"TAG",0,0,1
"ATG",1,1,2
3 changes: 3 additions & 0 deletions workflows/scatac-csv-to-arrow/test-job.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input_directory:
class: Directory
path: test-input
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"datatype": "cell",
"version": "0.1.2",
"tree": [
{
"name": "Leiden Cluster",
"children": [
{
"name": "Cluster 0",
"set": [
"CAT"
]
},
{
"name": "Cluster 1",
"set": [
"TAG"
]
},
{
"name": "Cluster 2",
"set": [
"ATG"
]
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"CAT": {"mappings": {"UMAP": [-1, -1]}, "factors": {"Leiden Cluster": "0"}},
"TAG": {"mappings": {"UMAP": [0, 0]}, "factors": {"Leiden Cluster": "1"}},
"ATG": {"mappings": {"UMAP": [1, 1]}, "factors": {"Leiden Cluster": "2"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
index,umap_x,umap_y,leiden
CAT,-1,-1,0
TAG,0,0,1
ATG,1,1,2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"Leiden Cluster": {"map": ["0", "1", "2"], "cells": {"CAT": 0, "TAG": 1, "ATG": 2}}}