From 0b94a44c9ef9543ae3c5ec8d77907b4269c5208a Mon Sep 17 00:00:00 2001
From: Mark Keller <7525285+keller-mark@users.noreply.github.com>
Date: Mon, 6 Mar 2023 10:52:52 -0500
Subject: [PATCH] Fix cluster name handling in atac-seq script (#118)

* Update handling of cluster names

* Bump version

* Update
---
 containers/scatac-csv-to-arrow/VERSION        |  2 +-
 .../scatac-csv-to-arrow/context/main.py       | 19 +++++++++++++++----
 scatac-csv-to-arrow.cwl                       |  2 +-
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/containers/scatac-csv-to-arrow/VERSION b/containers/scatac-csv-to-arrow/VERSION
index 4e379d2..bcab45a 100644
--- a/containers/scatac-csv-to-arrow/VERSION
+++ b/containers/scatac-csv-to-arrow/VERSION
@@ -1 +1 @@
-0.0.2
+0.0.3
diff --git a/containers/scatac-csv-to-arrow/context/main.py b/containers/scatac-csv-to-arrow/context/main.py
index 4b1b7b4..e7847e7 100644
--- a/containers/scatac-csv-to-arrow/context/main.py
+++ b/containers/scatac-csv-to-arrow/context/main.py
@@ -22,6 +22,14 @@ def arrow_to_csv(arrow_file, csv_file):
     df = pa.ipc.open_file(arrow_file).read_pandas()
     df.to_csv(csv_file)
 
+def try_str_to_int(val):
+    """Sort-key helper: return the digits in `val` as an int, or `val` unchanged if it has none."""
+    try:
+        # Keep only numeric characters; fall back to `val` (never None) so non-numeric names stay sortable.
+        val_numeric = "".join(filter(str.isdigit, val))
+        return int(val_numeric) if val_numeric else val
+    except (TypeError, ValueError):  # non-string input, or digit-like characters that int() rejects
+        return val
 
 # Big TODO: deduplicate this with h5ad-to-arrow
 def arrow_to_json(arrow_file, **kwargs):
@@ -31,10 +39,14 @@ def arrow_to_json(arrow_file, **kwargs):
     df = pa.ipc.open_file(arrow_file).read_pandas()
     df_items = df.T.to_dict().items()
 
+    # It is possible for the cluster names to not be integers.
+    df['leiden'] = df['leiden'].astype(str)
+    leiden_clusters = sorted(df['leiden'].unique(), key=try_str_to_int)
+
     id_to_umap = {
         k: {
             "mappings": {"UMAP": [v['umap_x'], v['umap_y']]},
-            "factors": {"Leiden Cluster": str(int(v['leiden']))}
+            "factors": {"Leiden Cluster": str(v['leiden'])}
         }
         for (k,v) in df_items
     }
@@ -42,11 +54,10 @@ def arrow_to_json(arrow_file, **kwargs):
     with open(umap_json, 'w') as f:
         f.write(pretty_json_umap)
 
-    leiden_clusters = sorted(df['leiden'].unique().astype('uint8'))
     id_to_factors = {
         'Leiden Cluster': {
-            'map': [str(cluster) for cluster in leiden_clusters],
-            'cells': { k: v['leiden'] for (k,v) in df_items }
+            'map': leiden_clusters,
+            'cells': { k: leiden_clusters.index(str(v['leiden'])) for (k,v) in df_items }
         }
     }
     pretty_json_factors = json.dumps(id_to_factors).replace('}},', '}},\n')
diff --git a/scatac-csv-to-arrow.cwl b/scatac-csv-to-arrow.cwl
index 249ae5e..e7e4a43 100755
--- a/scatac-csv-to-arrow.cwl
+++ b/scatac-csv-to-arrow.cwl
@@ -6,7 +6,7 @@ class: CommandLineTool
 baseCommand: ['python', '/main.py', '--output_dir', './output', '--input_dir']
 hints:
   DockerRequirement:
-    dockerPull: hubmap/portal-container-scatac-csv-to-arrow:0.0.1
+    dockerPull: hubmap/portal-container-scatac-csv-to-arrow:0.0.3
 inputs:
   input_directory:
     type: Directory