Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scATAC-seq UMAP DR + clustering -> Arrow container #40

Merged
merged 4 commits into from
Jun 18, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions containers/scatac-csv-to-arrow/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# NOTE(review): the comment below appears copied from a sibling container;
# presumably "it" refers to this Dockerfile being passed via `docker build --file`
# rather than living at the context root — confirm against the build scripts.
# We just need to use --file to point at it, instead of assuming it is in context.

# Using Conda because pyarrow did not install easily on python base images.
FROM continuumio/miniconda3:4.7.12

# gcc + python3-dev are needed to compile Python packages with C extensions.
# (The original "For tiff packages" comment was likely copied from an imaging
# container; nothing in this container's requirements is TIFF-related.)
RUN apt-get update &&\
apt-get install -y gcc python3-dev
COPY requirements-freeze.txt .
RUN pip install -r ./requirements-freeze.txt

# In development, you may want to pin a single dependency in requirements.txt,
# without throwing away the entire cache layer from requirements-freeze.txt.
# (But once it works, you should check in an updated freeze!)

COPY requirements.txt .
RUN pip install -r ./requirements.txt

COPY . .

# Matches the CWL baseCommand: read CSVs from /input, write outputs to /output.
CMD [ "python", "main.py", \
"--input_dir", "/input", \
"--output_dir", "/output" ]
5 changes: 5 additions & 0 deletions containers/scatac-csv-to-arrow/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# scatac-csv-to-arrow

Translate CSV output from the HuBMAP scATAC-seq pipeline into
[Apache Arrow](https://arrow.apache.org/) format,
as well as into a normalized CSV and Vitessce-compatible JSON.
1 change: 1 addition & 0 deletions containers/scatac-csv-to-arrow/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.0.1
110 changes: 110 additions & 0 deletions containers/scatac-csv-to-arrow/context/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
import argparse
from pathlib import Path
import json

import pandas as pd
import pyarrow as pa


def csv_to_arrow(csv_file: Path, arrow_file: Path):
    """Convert a pipeline CSV to an Arrow IPC (random-access) file.

    The CSV's first column is used as the index; the remaining three
    columns are renamed to the standardized 'umap_x', 'umap_y', 'leiden'
    (raw pipeline headers are discarded).

    :param csv_file: Path of the input CSV (index + exactly 3 data columns).
    :param arrow_file: Path where the Arrow file will be written.
    """
    df = pd.read_csv(csv_file, index_col=0)
    df.index.name = 'index'
    # Standardize column names regardless of what the raw pipeline emitted.
    df.columns = ['umap_x', 'umap_y', 'leiden']

    table = pa.Table.from_pandas(df)

    # Close the writer even if write() raises, so we don't leak the file
    # handle or leave an Arrow file with a missing footer on disk.
    writer = pa.RecordBatchFileWriter(arrow_file, table.schema)
    try:
        writer.write(table)
    finally:
        writer.close()


def arrow_to_csv(arrow_file, csv_file):
    """Round-trip an Arrow IPC file back out as a normalized CSV."""
    reader = pa.ipc.open_file(arrow_file)
    frame = reader.read_pandas()
    frame.to_csv(csv_file)


# Big TODO: deduplicate this with h5ad-to-arrow
def arrow_to_json(arrow_file, **kwargs):
    """Derive the three Vitessce JSON files from the Arrow file.

    Required keyword arguments:
        umap_json: output path for per-cell UMAP coords + cluster factor.
        leiden_json: output path for the cluster "factors" mapping.
        cell_sets_json: output path for the hierarchical cell-sets tree.
    """
    umap_json = kwargs['umap_json']
    leiden_json = kwargs['leiden_json']
    cell_sets_json = kwargs['cell_sets_json']
    df = pa.ipc.open_file(arrow_file).read_pandas()
    # df.T.to_dict() would coerce every row to a single common dtype
    # (floats, when the UMAP coordinates are floats), corrupting the
    # integer cluster IDs below; orient='index' keeps per-column dtypes.
    df_items = df.to_dict(orient='index').items()

    id_to_umap = {
        k: {
            "mappings": {"UMAP": [v['umap_x'], v['umap_y']]},
            "factors": {"Leiden Cluster": str(int(v['leiden']))}
        }
        for (k, v) in df_items
    }
    # Poor-man's pretty-printing: one cell per line for easier diffing.
    pretty_json_umap = json.dumps(id_to_umap).replace('}},', '}},\n')
    with open(umap_json, 'w') as f:
        f.write(pretty_json_umap)

    # Plain Python ints: astype('uint8') would silently wrap cluster IDs
    # above 255, and numpy scalars are not JSON-serializable.
    leiden_clusters = sorted(int(cluster) for cluster in df['leiden'].unique())
    id_to_factors = {
        'Leiden Cluster': {
            'map': [str(cluster) for cluster in leiden_clusters],
            # int(...) guards against float coercion upstream and keeps
            # the JSON values integral, matching the factors fixtures.
            'cells': {k: int(v['leiden']) for (k, v) in df_items}
        }
    }
    pretty_json_factors = json.dumps(id_to_factors).replace('}},', '}},\n')
    with open(leiden_json, 'w') as f:
        f.write(pretty_json_factors)

    # Construct the tree, according to the following schema:
    # https://github.com/hubmapconsortium/vitessce/blob/d5f63aa1d08aa61f6b20f6ad6bbfba5fceb6b5ef/src/schemas/cell_sets.schema.json
    cell_sets = {
        "datatype": "cell",
        "version": "0.1.2",
        "tree": [
            {
                "name": "Leiden Cluster",
                "children": [
                    {
                        "name": f"Cluster {cluster}",
                        "set": df.loc[df['leiden'] == cluster].index.values.tolist(),
                    }
                    for cluster in leiden_clusters
                ]
            }
        ]
    }
    with open(cell_sets_json, 'w') as f:
        json.dump(cell_sets, f, indent=1)


def main(input_dir: Path, output_dir: Path):
    """Convert every umap_coords_clusters.csv under input_dir.

    For each input CSV, writes four sibling outputs into output_dir:
    .arrow, .csv (normalized), .cells.json, .factors.json, .cell-sets.json.
    """
    output_dir.mkdir(exist_ok=True, parents=True)
    for csv_path in input_dir.glob('**/umap_coords_clusters.csv'):
        arrow_path = output_dir / csv_path.with_suffix('.arrow').name
        csv_to_arrow(csv_path, arrow_path)
        arrow_to_csv(arrow_path, arrow_path.with_suffix('.csv'))
        arrow_to_json(
            arrow_file=arrow_path,
            umap_json=arrow_path.with_suffix('.cells.json'),
            leiden_json=arrow_path.with_suffix('.factors.json'),
            cell_sets_json=arrow_path.with_suffix('.cell-sets.json'),
        )


if __name__ == '__main__':
    # CLI entry point; argument names mirror the Docker CMD in the container.
    parser = argparse.ArgumentParser(
        description='Transform raw pipeline CSV into Arrow, JSON, and standardized CSV.',
    )
    parser.add_argument(
        '--input_dir',
        type=Path,
        required=True,
        help='directory containing csv files to read',
    )
    parser.add_argument(
        '--output_dir',
        type=Path,
        required=True,
        help='directory where arrow files should be written',
    )
    cli_args = parser.parse_args()
    main(cli_args.input_dir, cli_args.output_dir)
30 changes: 30 additions & 0 deletions containers/scatac-csv-to-arrow/context/requirements-freeze.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
anndata==0.7.1
asn1crypto==1.0.1
certifi==2019.11.28
cffi==1.12.3
chardet==3.0.4
conda==4.7.12
conda-package-handling==1.6.0
cryptography==2.7
h5py==2.10.0
idna==2.8
importlib-metadata==1.6.0
natsort==6.2.0
numpy==1.17.4
packaging==20.3
pandas==0.25.3
pyarrow==0.15.1
pycosat==0.6.3
pycparser==2.19
pyOpenSSL==19.0.0
pyparsing==2.4.7
PySocks==1.7.1
python-dateutil==2.8.1
pytz==2019.3
requests==2.22.0
ruamel-yaml==0.15.46
scipy==1.3.3
six==1.13.0
tqdm==4.36.1
urllib3==1.24.2
zipp==3.1.0
3 changes: 3 additions & 0 deletions containers/scatac-csv-to-arrow/context/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
anndata==0.7.1
pyarrow==0.15.1
h5py==2.10.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"","umap.1","umap.2","cluster"
"CAT",-1,-1,0
"TAG",0,0,1
"ATG",1,1,2
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"datatype": "cell",
"version": "0.1.2",
"tree": [
{
"name": "Leiden Cluster",
"children": [
{
"name": "Cluster 0",
"set": [
"CAT"
]
},
{
"name": "Cluster 1",
"set": [
"TAG"
]
},
{
"name": "Cluster 2",
"set": [
"ATG"
]
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"CAT": {"mappings": {"UMAP": [-1, -1]}, "factors": {"Leiden Cluster": "0"}},
"TAG": {"mappings": {"UMAP": [0, 0]}, "factors": {"Leiden Cluster": "1"}},
"ATG": {"mappings": {"UMAP": [1, 1]}, "factors": {"Leiden Cluster": "2"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
index,umap_x,umap_y,leiden
CAT,-1,-1,0
TAG,0,0,1
ATG,1,1,2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"Leiden Cluster": {"map": ["0", "1", "2"], "cells": {"CAT": 0, "TAG": 1, "ATG": 2}}}
18 changes: 18 additions & 0 deletions scatac-csv-to-arrow-manifest.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
[
{
"pattern": "output/(.*)\\.arrow",
"description": "Input data relevant for visualization saved in columnar Apache Arrow format."
},
{
"pattern": "output/(.*)\\.csv",
"description": "Input data relevant for visualization saved in columnar comma-separated-file format."
},
{
"pattern": "output/(.*)\\.cells\\.json",
"description": "JSON-formatted information about this scATAC-seq run including scatterplot coordinates and clustering."
},
{
"pattern": "output/(.*)\\.factors\\.json",
"description": "JSON-formatted information about this scATAC-seq's clustering."
}
]
19 changes: 19 additions & 0 deletions scatac-csv-to-arrow.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env cwl-runner

cwlVersion: v1.0
class: CommandLineTool
# TODO: Make main.py executable?
# Fixed arguments come first; the input directory is appended via inputBinding.
baseCommand: ['python', '/main.py', '--output_dir', './output', '--input_dir']
hints:
  DockerRequirement:
    # NOTE(review): keep this tag in sync with the container's VERSION file.
    dockerPull: hubmap/portal-container-scatac-csv-to-arrow:0.0.1
inputs:
  input_directory:
    type: Directory
    inputBinding:
      # Appended after the baseCommand tokens, as the value of --input_dir.
      position: 6
outputs:
  output_directory:
    type: Directory
    outputBinding:
      # Collect the ./output directory that main.py creates.
      glob: output
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
"","umap.1","umap.2","cluster"
"CAT",-1,-1,0
"TAG",0,0,1
"ATG",1,1,2
3 changes: 3 additions & 0 deletions workflows/scatac-csv-to-arrow/test-job.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
input_directory:
class: Directory
path: test-input
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"datatype": "cell",
"version": "0.1.2",
"tree": [
{
"name": "Leiden Cluster",
"children": [
{
"name": "Cluster 0",
"set": [
"CAT"
]
},
{
"name": "Cluster 1",
"set": [
"TAG"
]
},
{
"name": "Cluster 2",
"set": [
"ATG"
]
}
]
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{"CAT": {"mappings": {"UMAP": [-1, -1]}, "factors": {"Leiden Cluster": "0"}},
"TAG": {"mappings": {"UMAP": [0, 0]}, "factors": {"Leiden Cluster": "1"}},
"ATG": {"mappings": {"UMAP": [1, 1]}, "factors": {"Leiden Cluster": "2"}}}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
index,umap_x,umap_y,leiden
CAT,-1,-1,0
TAG,0,0,1
ATG,1,1,2
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"Leiden Cluster": {"map": ["0", "1", "2"], "cells": {"CAT": 0, "TAG": 1, "ATG": 2}}}