From 0b94a44c9ef9543ae3c5ec8d77907b4269c5208a Mon Sep 17 00:00:00 2001 From: Mark Keller <7525285+keller-mark@users.noreply.github.com> Date: Mon, 6 Mar 2023 10:52:52 -0500 Subject: [PATCH] Fix cluster name handling in atac-seq script (#118) * Update handling of cluster names * Bump version * Update --- containers/scatac-csv-to-arrow/VERSION | 2 +- .../scatac-csv-to-arrow/context/main.py | 19 +++++++++++++++---- scatac-csv-to-arrow.cwl | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/containers/scatac-csv-to-arrow/VERSION b/containers/scatac-csv-to-arrow/VERSION index 4e379d2..bcab45a 100644 --- a/containers/scatac-csv-to-arrow/VERSION +++ b/containers/scatac-csv-to-arrow/VERSION @@ -1 +1 @@ -0.0.2 +0.0.3 diff --git a/containers/scatac-csv-to-arrow/context/main.py b/containers/scatac-csv-to-arrow/context/main.py index 4b1b7b4..e7847e7 100644 --- a/containers/scatac-csv-to-arrow/context/main.py +++ b/containers/scatac-csv-to-arrow/context/main.py @@ -22,6 +22,14 @@ def arrow_to_csv(arrow_file, csv_file): df = pa.ipc.open_file(arrow_file).read_pandas() df.to_csv(csv_file) +def try_str_to_int(val): + try: + # Keep only numeric characters. + val_numeric = "".join(filter(str.isdigit, val)) + if len(val_numeric) > 0: + return int(val_numeric) + except: + return val # Big TODO: deduplicate this with h5ad-to-arrow def arrow_to_json(arrow_file, **kwargs): @@ -31,10 +39,14 @@ def arrow_to_json(arrow_file, **kwargs): df = pa.ipc.open_file(arrow_file).read_pandas() df_items = df.T.to_dict().items() + # It is possible for the cluster names to not be integers. + df['leiden'] = df['leiden'].astype(str) + leiden_clusters = sorted(df['leiden'].unique(), key=try_str_to_int) + id_to_umap = { k: { "mappings": {"UMAP": [v['umap_x'], v['umap_y']]}, - "factors": {"Leiden Cluster": str(int(v['leiden']))} + "factors": {"Leiden Cluster": str(v['leiden'])} } for (k,v) in df_items } @@ -42,11 +54,10 @@ def arrow_to_json(arrow_file, **kwargs): with open(umap_json, 'w') as f: f.write(pretty_json_umap) - leiden_clusters = sorted(df['leiden'].unique().astype('uint8')) id_to_factors = { 'Leiden Cluster': { - 'map': [str(cluster) for cluster in leiden_clusters], - 'cells': { k: v['leiden'] for (k,v) in df_items } + 'map': leiden_clusters, + 'cells': { k: leiden_clusters.index(str(v['leiden'])) for (k,v) in df_items } } } pretty_json_factors = json.dumps(id_to_factors).replace('}},', '}},\n') diff --git a/scatac-csv-to-arrow.cwl b/scatac-csv-to-arrow.cwl index 249ae5e..e7e4a43 100755 --- a/scatac-csv-to-arrow.cwl +++ b/scatac-csv-to-arrow.cwl @@ -6,7 +6,7 @@ class: CommandLineTool baseCommand: ['python', '/main.py', '--output_dir', './output', '--input_dir'] hints: DockerRequirement: - dockerPull: hubmap/portal-container-scatac-csv-to-arrow:0.0.1 + dockerPull: hubmap/portal-container-scatac-csv-to-arrow:0.0.3 inputs: input_directory: type: Directory