From 83adae3d3100995c8d5835a125a8f4d2bd83537d Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 4 Aug 2020 18:15:18 +0100
Subject: [PATCH 01/29] Correct scatter function usage and normalisation step

---
 scanpy-scripts-tests.bats     |  2 +-
 scanpy_scripts/cmd_options.py | 42 ++++++++++++++++++++++++++++++-----
 scanpy_scripts/cmd_utils.py   |  2 +-
 3 files changed, 39 insertions(+), 7 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index fe1f78d6..07757cdb 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -14,7 +14,7 @@ setup() {
     filter_opt="-p n_genes 200 2500 -p c:n_counts 0 50000 -p n_cells 3 inf -p pct_counts_mito 0 0.2 -c mito '!True' --show-obj stdout"
     filter_obj="${output_dir}/filter.h5ad"
     norm_mtx="${output_dir}/norm"
-    norm_opt="-r yes -t 10000 -X ${norm_mtx} --show-obj stdout"
+    norm_opt="-r yes -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout"
     norm_obj="${output_dir}/norm.h5ad"
     hvg_opt="-m 0.0125 3 -d 0.5 inf -s --show-obj stdout"
     hvg_obj="${output_dir}/hvg.h5ad"
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 39febb2d..d181d67b 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -565,6 +565,7 @@
     'norm': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
+        COMMON_OPTIONS['key_added'],
         click.option(
             '--save-raw', '-r',
             type=click.Choice(['yes', 'no', 'counts']),
@@ -587,13 +588,44 @@
             help='Normalize per cell nUMI to this number.',
         ),
         click.option(
-            '--fraction',
+            '--exclude-highly-expressed', '-e', 'exclude_highly_expressed',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help='Exclude (very) highly expressed genes for the computation of '
+            'the normalization factor (size factor) for each cell. A gene is considered '
+            'highly expressed, if it has more than max_fraction of the total counts in at '
+            'least one cell. The not-excluded genes will sum up to the number '
+            'specified by --normalize-to.'
+        ),
+        click.option(
+            '--max-fraction', '-m', 'max_fraction',
             type=float,
-            default=0.9,
+            default=0.05,
+            show_default=True,
+            help='If exclude_highly_expressed=True, consider cells as highly '
+            'expressed that have more counts than max_fraction of the original total counts '
+            'in at least one cell.'
+        ),
+        click.option(
+            '--layers', '-l',
+            type=CommaSeparatedText(simplify=True),
+            default=None,
+            show_default=True,
+            help="List of layers to normalize. Set to 'all' to normalize all layers."
+        ),
+        click.option(
+            '--layer-norm', '-n', 'layer_norm',
+            type=click.Choice(['after', 'X']),
+            default=None,
             show_default=True,
-            help='Only use genes that make up less than this fraction of the total '
-            'count in every cell. So only these genes will sum up to the number '
-            'specified by --normalize-to.',
+            help="Specifies how to normalize layers: 1) If None, after "
+            "normalization, for each layer in layers each cell has a total count equal to "
+            "the median of the counts_per_cell before normalization of the layer. 2) If "
+            "'after', for each layer in layers each cell has a total count equal to "
+            "target_sum. 3) If 'X', for each layer in layers each cell has a total count "
+            "equal to the median of total counts for observations (cells) of adata.X before "
+            "normalization.'"
         ),
     ],
 
diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py
index a1e88f2b..df2b7ca1 100644
--- a/scanpy_scripts/cmd_utils.py
+++ b/scanpy_scripts/cmd_utils.py
@@ -159,7 +159,7 @@ def make_plot_function(func_name, kind=None):
     # Provide a function translation
 
     plot_funcs = {
-        'scatter': sc.plotting._tools.scatterplots.plot_scatter,
+        'scatter': sc.pl.scatter,
         'sviol': sc.pl.stacked_violin,
         'rgg_sviol': sc.pl.rank_genes_groups_stacked_violin,
         'dot': sc.pl.dotplot,

From c3f4d644d23bb725d0ce92a4c1756db83fa99128 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 4 Aug 2020 18:20:08 +0100
Subject: [PATCH 02/29] Relax most pins

---
 setup.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/setup.py b/setup.py
index 336fa217..536cc5e4 100644
--- a/setup.py
+++ b/setup.py
@@ -36,12 +36,12 @@
     ),
     install_requires=[
         'packaging',
-        'anndata<0.6.20',
-        'scipy>=1.2.0,<1.3.0',
+        'anndata',
+        'scipy',
         'matplotlib',
         'pandas',
-        'h5py<2.10',
-        'scanpy>=1.4.2,<1.4.4',
+        'h5py',
+        'scanpy>=1.5.1',
         'louvain',
         'leidenalg',
         'loompy>=2.0.0,<3.0.0',

From 38701f1799dfcb03fa7a38050faaa29fa2627e57 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Wed, 5 Aug 2020 10:55:13 +0100
Subject: [PATCH 03/29] Add some missing options

---
 scanpy-scripts-tests.bats     |  2 +-
 scanpy_scripts/cmd_options.py | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 07757cdb..ab2ecca8 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -25,7 +25,7 @@ setup() {
     pca_embed="${output_dir}/pca.tsv"
     pca_opt="--n-comps 50 -V auto --show-obj stdout -E ${pca_embed}"
     pca_obj="${output_dir}/pca.h5ad"
-    neighbor_opt="-k 5,10,20 -n 25 -m umap --show-obj stdout"
+    neighbor_opt="-k 5,10,20 -n 25 -m umap -t euclidean --show-obj stdout"
     neighbor_obj="${output_dir}/neighbor.h5ad"
     tsne_embed="${output_dir}/tsne.tsv"
     tsne_opt="-n 25 --use-rep X_pca --learning-rate 200 -E ${tsne_embed}"
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index d181d67b..0078e850 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -700,6 +700,12 @@
             help='When specified, clip to this value after scaling, otherwise do '
             'not clip',
         ),
+        click.option(
+            '--layer', '-l',
+            type=CommaSeparatedText(simplify=True),
+            default=None,
+            help="If provided, which element of layers to scale."
+        ),
     ],
 
     'regress': [
@@ -788,11 +794,19 @@
         ),
         click.option(
             '--method', '-m',
-            type=click.Choice(['umap', 'gauss']),
+            type=click.Choice(['umap', 'gauss', 'rapids']),
             default='umap',
             show_default=True,
             help='Use umap or gauss with adaptive width for computing '
-            'connectivities.'
+            'connectivities. Use rapids for the RAPIDS implementation of UMAP '
+            '(experimental, GPU only).'
+        ),
+        click.option(
+            '--metric', '-t',
+            type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']),
+            default='euclidean',
+            show_default=True,
+            help='A known metric’s name.'
         ),
     ],
 

From 10171a5d8b56bf4f506f8d4d26fd07a6fada037f Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Wed, 5 Aug 2020 11:00:42 +0100
Subject: [PATCH 04/29] Add method argument for umap

---
 scanpy_scripts/cmd_options.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 0078e850..208b6518 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -879,6 +879,14 @@
             help='The number of negative edge samples to use per positive edge '
             'sample in optimizing the low dimensional embedding.',
         ),
+        click.option(
+            '--method',
+            type=click.Choice(['umap', 'rapids']),
+            default='umap',
+            show_default=True,
+            help='Use the original ‘umap’ implementation, or ‘rapids’ '
+            '(experimental, GPU only).'
+        ),
     ],
 
     'tsne': [

From f4a0b6b359bef81608657cb12c0e157da0e6b356 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Wed, 5 Aug 2020 18:19:02 +0100
Subject: [PATCH 05/29] Fix fdg call to use scanpy-native functionality for
 slot specification

---
 scanpy-scripts-tests.bats     |  2 +-
 scanpy_scripts/cmd_options.py | 41 +++++++++++++++++++++++++++++++----
 scanpy_scripts/lib/_fdg.py    | 21 ++++--------------
 3 files changed, 42 insertions(+), 22 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index ab2ecca8..43abf11b 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -34,7 +34,7 @@ setup() {
     umap_opt="--use-graph neighbors_k10 --min-dist 0.75 --alpha 1 --gamma 1 -E ${umap_embed}"
     umap_obj="${output_dir}/umap.h5ad"
     fdg_embed="${output_dir}/fdg.tsv"
-    fdg_opt="--use-graph neighbors_k10 --layout fr -E ${fdg_embed}"
+    fdg_opt="--neighbors-key neighbors_k10 --layout fr -E ${fdg_embed}"
     fdg_obj="${output_dir}/fdg.h5ad"
     louvain_opt="-r 0.5,1 --use-graph neighbors_k10 --key-added k10"
     louvain_obj="${output_dir}/louvain.h5ad"
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 208b6518..de3149a9 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -456,7 +456,15 @@
             show_default=True,
             help='Number of genes to show.'
         ),
-    ]
+    ],
+
+    'root': click.option(
+        '--root',
+        type=click.INT,
+        default=0,
+        show_default=True,
+        help='If choosing a tree layout, this is the index of the root node.',
+    ),
 }
 
 CMD_OPTIONS = {
@@ -946,10 +954,9 @@
     'fdg': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
-        COMMON_OPTIONS['knn_graph'][0], # --use-graph
         COMMON_OPTIONS['random_state'],
-        COMMON_OPTIONS['key_added'],
         COMMON_OPTIONS['export_embedding'],
+        COMMON_OPTIONS['root'],
         click.option(
             '--init-pos',
             type=click.STRING,
@@ -959,7 +966,7 @@
         ),
         click.option(
             '--layout',
-            type=click.Choice(['fa', 'fr', 'grid_fr', 'kk', 'lgl', 'drl', 'rt']),
+            type=click.Choice(['fa', 'fr', 'grid_fr', 'kk', 'lgl', 'drl', 'rt', 'rt_circular']),
             default='fa',
             show_default=True,
             help='Name of any valid igraph layout, including "fa" (ForceAtlas2), '
@@ -968,6 +975,13 @@
             '(Large Graph Layout, very fast), "drl" (Distributed Recursive Layout, '
             'pretty fast) and "rt" (Reingold Tilford tree layout).',
         ),
+        click.option(
+            '--key-added-ext',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help="By default, append 'layout'"
+        ),
         click.option(
             '--init-pos',
             type=click.STRING,
@@ -976,6 +990,24 @@
             help='How to initialize the low dimensional embedding. Can be "paga", '
             'or any valid key of `.obsm`.',
         ),
+        click.option(
+            '--neighbors-key',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help='If not specified, draw_graph looks .obsp[‘connectivities’] '
+            'for connectivities (default storage place for pp.neighbors). If specified, '
+            'draw_graph looks .obsp[.uns[neighbors_key][‘connectivities_key’]] for '
+            'connectivities.'
+        ),
+        click.option(
+            '--obsp',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help='Use .obsp[obsp] as adjacency. You can’t specify both obsp and '
+            'neighbors_key at the same time.'
+        ),
     ],
 
     'louvain': [
@@ -1272,6 +1304,7 @@
             help='Do not draw edges for weights below this threshold. Set to 0 to '
             'include all edges.',
         ),
+        COMMON_OPTIONS['root'],
         click.option(
             '--root',
             type=click.INT,
diff --git a/scanpy_scripts/lib/_fdg.py b/scanpy_scripts/lib/_fdg.py
index 91270f76..68bf95d9 100644
--- a/scanpy_scripts/lib/_fdg.py
+++ b/scanpy_scripts/lib/_fdg.py
@@ -15,7 +15,7 @@ def fdg(
         adata,
         use_graph='neighbors',
         layout='fa',
-        key_added=None,
+        key_added_ext=None,
         random_state=0,
         export_embedding=None,
         **kwargs
@@ -24,29 +24,16 @@ def fdg(
     Wrapper function for sc.tl.draw_graph, for supporting named slot of fdg
     embeddings.
     """
-    adj_mat = None
-    if use_graph != 'neighbors':
-        if use_graph not in adata.uns.keys():
-            raise KeyError(f'"{use_graph}" is not a valid key of `.uns`.')
-        adj_mat = adata.uns[use_graph]['connectivities']
-
-    _backup_obsm_key(adata, f'X_draw_graph_{layout}')
-
     sc.tl.draw_graph(
         adata,
         layout=layout,
+        key_added_ext=key_added_ext,
         random_state=random_state,
-        adjacency=adj_mat,
         **kwargs,
     )
 
-    fdg_key = f'X_draw_graph_{layout}'
-    if key_added:
-        fdg_key = f'X_draw_graph_{layout}_{key_added}'
-        _rename_obsm_key(adata, f'X_draw_graph_{layout}', fdg_key)
-    else:
-        _delete_obsm_backup_key(adata, f'X_draw_graph_{layout}')
+    fdg_key  = f'X_draw_graph_{key_added_ext or layout}'
 
     if export_embedding is not None:
-        write_embedding(adata, fdg_key, export_embedding, key_added=key_added)
+        write_embedding(adata, fdg_key, export_embedding, key_added=key_added_ext)
     return adata

From 155e35685b77de7811fc769127a06f6d0be8551f Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 6 Aug 2020 14:26:22 +0100
Subject: [PATCH 06/29] Use Scanpy-native option for graph specification in
 umap, move associated fields to common

---
 scanpy-scripts-tests.bats     |  2 +-
 scanpy_scripts/cmd_options.py | 43 ++++++++++++++++-------------------
 scanpy_scripts/lib/_umap.py   |  4 ----
 3 files changed, 21 insertions(+), 28 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 43abf11b..966553a5 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -31,7 +31,7 @@ setup() {
     tsne_opt="-n 25 --use-rep X_pca --learning-rate 200 -E ${tsne_embed}"
     tsne_obj="${output_dir}/tsne.h5ad"
     umap_embed="${output_dir}/umap.tsv"
-    umap_opt="--use-graph neighbors_k10 --min-dist 0.75 --alpha 1 --gamma 1 -E ${umap_embed}"
+    umap_opt="--neighbors-key neighbors_k10 --min-dist 0.75 --alpha 1 --gamma 1 -E ${umap_embed}"
     umap_obj="${output_dir}/umap.h5ad"
     fdg_embed="${output_dir}/fdg.tsv"
     fdg_opt="--neighbors-key neighbors_k10 --layout fr -E ${fdg_embed}"
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index de3149a9..feab4e09 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -135,12 +135,25 @@
 
     'knn_graph': [
         click.option(
-            '--use-graph',
+            '--neighbors-key',
+            type=click.STRING,
+            default=None,
+            show_default=False,
+            help='If not specified, look in .uns[‘neighbors’] for neighbors '
+            'settings and .obsp[‘connectivities’], .obsp[‘distances’] for connectivities and '
+            'distances respectively (default storage places for pp.neighbors). If specified, '
+            'look in .uns[neighbors_key] for neighbors settings and '
+            '.obsp[.uns[neighbors_key][‘connectivities_key’]], '
+            '.obsp[.uns[neighbors_key][‘distances_key’]] for connectivities and distances '
+            'respectively.'
+        ),
+        click.option(
+            '--obsp',
             type=click.STRING,
-            default='neighbors',
+            default=None,
             show_default=True,
-            help='Slot name under `.uns` that contains the KNN graph of which '
-            'sparse adjacency matrix is used for clustering.',
+            help='Use .obsp[obsp] as adjacency. You can’t specify both obsp and '
+            'neighbors_key at the same time.'
         ),
         click.option(
             '--directed/--undirected', 'directed',
@@ -821,7 +834,7 @@
     'umap': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
-        COMMON_OPTIONS['knn_graph'][0], # --use-graph
+        COMMON_OPTIONS['knn_graph'][0], # --neighbors-key
         COMMON_OPTIONS['random_state'],
         COMMON_OPTIONS['key_added'],
         COMMON_OPTIONS['export_embedding'],
@@ -990,24 +1003,8 @@
             help='How to initialize the low dimensional embedding. Can be "paga", '
             'or any valid key of `.obsm`.',
         ),
-        click.option(
-            '--neighbors-key',
-            type=click.STRING,
-            default=None,
-            show_default=True,
-            help='If not specified, draw_graph looks .obsp[‘connectivities’] '
-            'for connectivities (default storage place for pp.neighbors). If specified, '
-            'draw_graph looks .obsp[.uns[neighbors_key][‘connectivities_key’]] for '
-            'connectivities.'
-        ),
-        click.option(
-            '--obsp',
-            type=click.STRING,
-            default=None,
-            show_default=True,
-            help='Use .obsp[obsp] as adjacency. You can’t specify both obsp and '
-            'neighbors_key at the same time.'
-        ),
+        COMMON_OPTIONS['knn_graph'][0], # --neighbors-key
+        COMMON_OPTIONS['knn_graph'][1], # --obsp
     ],
 
     'louvain': [
diff --git a/scanpy_scripts/lib/_umap.py b/scanpy_scripts/lib/_umap.py
index 0131c0f5..197edf83 100644
--- a/scanpy_scripts/lib/_umap.py
+++ b/scanpy_scripts/lib/_umap.py
@@ -14,7 +14,6 @@
 
 def umap(
         adata,
-        use_graph='neighbors',
         key_added=None,
         random_state=0,
         export_embedding=None,
@@ -23,7 +22,6 @@ def umap(
     """
     Wrapper function for sc.tl.umap, for supporting named slot of umap embeddings
     """
-    _set_default_key(adata.uns, 'neighbors', use_graph)
     if not isinstance(random_state, (list, tuple)):
         _backup_obsm_key(adata, 'X_umap')
 
@@ -51,10 +49,8 @@ def umap(
                                  'iterable of the same length as `random_state`.')
             umap(
                 adata,
-                use_graph='neighbors',
                 key_added=umap_key,
                 random_state=rseed,
                 **kwargs,
             )
-    _restore_default_key(adata.uns, 'neighbors', use_graph)
     return adata

From a36a7e5ac47e400750ed1984543c2988514a862d Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 6 Aug 2020 15:24:21 +0100
Subject: [PATCH 07/29] Remove legacy snippet

---
 scanpy_scripts/lib/_fdg.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scanpy_scripts/lib/_fdg.py b/scanpy_scripts/lib/_fdg.py
index 68bf95d9..6fbdb213 100644
--- a/scanpy_scripts/lib/_fdg.py
+++ b/scanpy_scripts/lib/_fdg.py
@@ -13,7 +13,6 @@
 
 def fdg(
         adata,
-        use_graph='neighbors',
         layout='fa',
         key_added_ext=None,
         random_state=0,

From 45f75c06d303c1cabbbdbd634d2909c337ac38af Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 6 Aug 2020 15:25:08 +0100
Subject: [PATCH 08/29] Switch louvain to native neighors_key/ obsp for slot
 selection

---
 scanpy-scripts-tests.bats      |  3 ++-
 scanpy_scripts/lib/_louvain.py | 23 ++++++++++++-----------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 966553a5..88c687ea 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -36,7 +36,8 @@ setup() {
     fdg_embed="${output_dir}/fdg.tsv"
     fdg_opt="--neighbors-key neighbors_k10 --layout fr -E ${fdg_embed}"
     fdg_obj="${output_dir}/fdg.h5ad"
-    louvain_opt="-r 0.5,1 --use-graph neighbors_k10 --key-added k10"
+    louvain_tsv="${output_dir}/louvain.tsv"
+    louvain_opt="-r 0.5,1 --neighbors-key neighbors_k10 --key-added k10 --export-cluster ${louvain_tsv}"
     louvain_obj="${output_dir}/louvain.h5ad"
     leiden_tsv="${output_dir}/leiden.tsv"
     leiden_opt="-r 0.3,0.7 --use-graph neighbors_k10 --key-added k10 -F loom --export-cluster ${leiden_tsv}"
diff --git a/scanpy_scripts/lib/_louvain.py b/scanpy_scripts/lib/_louvain.py
index 9d044355..3130da8c 100644
--- a/scanpy_scripts/lib/_louvain.py
+++ b/scanpy_scripts/lib/_louvain.py
@@ -9,7 +9,8 @@
 def louvain(
         adata,
         resolution,
-        use_graph=None,
+        neighbors_key=None,
+        obsp=None,
         key_added=None,
         export_cluster=None,
         **kwargs
@@ -20,11 +21,7 @@ def louvain(
     keys = []
     if kwargs['restrict_to'] and not kwargs['restrict_to'][0]:
         kwargs['restrict_to'] = None
-    adj_mat = None
-    if use_graph:
-        if use_graph not in adata.uns:
-            raise KeyError(f'"{use_graph}" is not a valid key of `.uns`.')
-        adj_mat = adata.uns[use_graph]['connectivities']
+    
     if not isinstance(resolution, (list, tuple)):
         if key_added is not None and not key_added.startswith('louvain_'):
             key_added = f'louvain_{key_added}'
@@ -33,19 +30,22 @@ def louvain(
         sc.tl.louvain(
             adata,
             resolution=resolution,
-            adjacency=adj_mat,
             key_added=key_added,
+            neighbors_key=neighbors_key,
+            obsp=obsp,
             **kwargs
         )
         keys.append(key_added)
     else:
         for i, res in enumerate(resolution):
+
             res_key = str(res).replace('.', '_')
+
             if key_added is None:
-                graph_key = ('_' + use_graph) if use_graph else ''
-                key = f'louvain{graph_key}_r{res_key}'
+                graph_key = ('_' + f'{neighbors_key or obsp}') if neighbors or obsp else ''
+                key = f'louvain{graph_key}_r{res_key}'    
             elif not isinstance(key_added, (list, tuple)):
-                key = 'louvain_{key_added}_r{res_key}'
+                key = f'louvain_{key_added}_r{res_key}'
             elif len(key_added) == len(resolution):
                 key = key_added[i]
             else:
@@ -54,7 +54,8 @@ def louvain(
             keys.extend(louvain(
                 adata,
                 resolution=res,
-                use_graph=use_graph,
+                neighbors_key=neighbors_key,
+                obsp=obsp,
                 key_added=key,
                 **kwargs,
             ))

From 5f051158f1a1e5008642dbf08a2727702956e2d1 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 6 Aug 2020 16:11:00 +0100
Subject: [PATCH 09/29] Switch leiden to native neighors_key/ obsp for slot
 selection, remove exhangeable Loom (incompatible with latest annData) and use
 native non-exchangeable Scanpy Loom export instead

---
 scanpy-scripts-tests.bats           |   2 +-
 scanpy_scripts/cmd_options.py       |   7 +
 scanpy_scripts/cmd_utils.py         |   8 +-
 scanpy_scripts/exchangeable_loom.py | 368 ----------------------------
 scanpy_scripts/lib/_leiden.py       |  17 +-
 5 files changed, 21 insertions(+), 381 deletions(-)
 delete mode 100644 scanpy_scripts/exchangeable_loom.py

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 88c687ea..1960e887 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -40,7 +40,7 @@ setup() {
     louvain_opt="-r 0.5,1 --neighbors-key neighbors_k10 --key-added k10 --export-cluster ${louvain_tsv}"
     louvain_obj="${output_dir}/louvain.h5ad"
     leiden_tsv="${output_dir}/leiden.tsv"
-    leiden_opt="-r 0.3,0.7 --use-graph neighbors_k10 --key-added k10 -F loom --export-cluster ${leiden_tsv}"
+    leiden_opt="-r 0.3,0.7 --neighbors-key neighbors_k10 --key-added k10 -F loom --loom-write-obsm-varm --export-cluster ${leiden_tsv}"
     leiden_obj="${output_dir}/leiden.loom"
     diffexp_tsv="${output_dir}/diffexp.tsv"
     diffexp_opt="-g leiden_k10_r0_7 --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv} -f loom"
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index feab4e09..d2daa9ff 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -48,6 +48,13 @@
             show_default=True,
             help='Chunk size for writing output in zarr format.',
         ),
+        click.option(
+            '--loom-write-obsm-varm', '-b',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help='Write obsm and varm to the Loom file?',
+        ),
         click.option(
             '--export-mtx', '-X',
             type=click.Path(dir_okay=True, writable=True),
diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py
index df2b7ca1..81e691b8 100644
--- a/scanpy_scripts/cmd_utils.py
+++ b/scanpy_scripts/cmd_utils.py
@@ -5,7 +5,6 @@
 import click
 import pandas as pd
 import scanpy as sc
-from .exchangeable_loom import read_exchangeable_loom, write_exchangeable_loom
 from .cmd_options import CMD_OPTIONS
 from .lib._paga import plot_paga
 
@@ -33,6 +32,7 @@ def cmd(
             input_format=None,
             output_format=None,
             zarr_chunk_size=None,
+            loom_write_obsm_varm=False,
             export_mtx=None,
             show_obj=None,
             **kwargs
@@ -50,6 +50,7 @@ def cmd(
                 output_obj,
                 output_format=output_format,
                 chunk_size=zarr_chunk_size,
+                write_obsm_varm=loom_write_obsm_varm,
                 export_mtx=export_mtx,
                 show_obj=show_obj,
             )
@@ -73,7 +74,7 @@ def _read_obj(input_obj, input_format='anndata', **kwargs):
     if input_format == 'anndata':
         adata = sc.read(input_obj, **kwargs)
     elif input_format == 'loom':
-        adata = read_exchangeable_loom(input_obj, **kwargs)
+        adata = sc.read_loom(input_obj, **kwargs)
     else:
         raise NotImplementedError(
             'Unsupported input format: {}'.format(input_format))
@@ -87,12 +88,13 @@ def _write_obj(
         chunk_size=None,
         export_mtx=None,
         show_obj=None,
+        write_obsm_varm=False,
         **kwargs
 ):
     if output_format == 'anndata':
         adata.write(output_obj, compression='gzip')
     elif output_format == 'loom':
-        write_exchangeable_loom(adata, output_obj, **kwargs)
+        adata.write_loom(output_obj, write_obsm_varm=write_obsm_varm )
     elif output_format == 'zarr':
         adata.write_zarr(output_obj, chunk_size=chunk_size, **kwargs)
     else:
diff --git a/scanpy_scripts/exchangeable_loom.py b/scanpy_scripts/exchangeable_loom.py
deleted file mode 100644
index 06d7af78..00000000
--- a/scanpy_scripts/exchangeable_loom.py
+++ /dev/null
@@ -1,368 +0,0 @@
-#!/usr/bin/env python
-"""exchangeable_loom
-
-The exchangeable Loom format is based on Loom spec 3.0.0 and is defined
-by the following minimum structure:
-
-    /.attrs['LOOM_SPEC_VERSION'] = '3.0.0'
-    /attr
-    /attr/manifest = table(columns=['loom_path', 'dtype', ...])
-    /matrix
-    /layers
-    /col_attrs
-    /col_graphs
-    /row_attrs
-    /row_graphs
-
-For conversion between AnnData and exchangeable Loom, anndata.obsm, anndata.varm
-and non-scalar value of anndata.uns are stored as datasets under /attr, while
-scalar value of anndata.uns are store at global attributes at /.attrs to make it
-backward-compatible with Loom 2.0.1 and AnnData.
-
-* Provides
-
-    Read and write functions for conversion between AnnData and exchangeable
-    Loom.
-"""
-
-import logging
-import anndata
-import h5py
-import numpy as np
-import pandas as pd
-import scipy.sparse as sp
-from packaging import version
-
-
-EXCHANGEABLE_LOOM_VERSION = '3.0.0'
-
-
-def _h5_read_attrs(node, name):
-    data = node.attrs[name]
-    if isinstance(data, np.ndarray) and len(data) == 1:
-        data = data[0]
-    return data
-
-
-def _h5_read_coo_matrix(node):
-    shape = tuple(map(int, node.attrs['shape'].decode().split(',')))
-    return sp.coo_matrix((node['w'], (node['a'], node['b'])), shape=shape)
-
-
-def _h5_write_coo_matrix(root, path, graph):
-    if path not in root:
-        root.create_group(path)
-    graph_node = root[path]
-    if not isinstance(graph_node, h5py.Group):
-        raise ValueError(path)
-    row_path = '{}/{}'.format(path, 'a')
-    col_path = '{}/{}'.format(path, 'b')
-    data_path = '{}/{}'.format(path, 'w')
-    if row_path in root:
-        del root[row_path]
-    if col_path in root:
-        del root[col_path]
-    if data_path in root:
-        del root[data_path]
-    root.create_dataset(row_path, data=graph.row)
-    root.create_dataset(col_path, data=graph.col)
-    root.create_dataset(data_path, data=graph.data)
-    graph_node.attrs['shape'] = (','.join(map(str, graph.shape))).encode()
-    return 0
-
-
-def _h5_read_csr_matrix(node):
-    shape = tuple(map(int, node.attrs['shape'].decode().split(',')))
-    return sp.csr_matrix((node['data'], node['indices'], node['indptr']), shape=shape)
-
-
-def _h5_write_csr_matrix(root, path, matrix):
-    if path not in root:
-        root.create_group(path)
-    graph_node = root[path]
-    if not isinstance(graph_node, h5py.Group):
-        raise ValueError(path)
-    data_path = '{}/{}'.format(path, 'data')
-    indices_path = '{}/{}'.format(path, 'indices')
-    indptr_path = '{}/{}'.format(path, 'indptr')
-    if data_path in root:
-        del root[data_path]
-    if indices_path in root:
-        del root[indices_path]
-    if indptr_path in root:
-        del root[indptr_path]
-    root.create_dataset(data_path, data=matrix.data)
-    root.create_dataset(indices_path, data=matrix.indices)
-    root.create_dataset(indptr_path, data=matrix.indptr)
-    graph_node.attrs['shape'] = (','.join(map(str, matrix.shape))).encode()
-    return 0
-
-
-def _h5_write_recursive_dictionary(
-        data,
-        path,
-        record,
-        attr_root=None,
-        dataset_root=None,
-        graph_root=None,
-):
-    if isinstance(data, dict):
-        for child in list(data.keys()):
-            _h5_write_recursive_dictionary(
-                data[child],
-                '{}__{}'.format(path, child),
-                record,
-                attr_root=attr_root,
-                dataset_root=dataset_root,
-                graph_root=graph_root,
-            )
-    elif isinstance(data, (bytes, str, int, float, np.number, np.character)):
-        dtype = type(data)
-        if attr_root:
-            if isinstance(data, str):
-                data = data.encode()
-            attr_root.attrs[path] = data
-            record.append('{}.attrs[{}]::scalar'.format(attr_root.name, path))
-        else:
-            logging.warning('Ignoring attr {} ({})'.format(path, dtype))
-    elif isinstance(data, (tuple, list, np.ndarray)):
-        dtype = type(data)
-        if dataset_root:
-            if not isinstance(data, np.ndarray):
-                data = np.array(data)
-            # Convert to bytes if unicode string
-            if data.dtype.kind == 'U':
-                data = data.astype(np.character)
-            if path in dataset_root:
-                del dataset_root[path]
-            dataset_root.create_dataset(path, data=data)
-            record.append('{}/{}::array'.format(dataset_root.name, path))
-        else:
-            logging.warning('Ignoring dataset {} ({})'.format(path, dtype))
-    elif isinstance(data, sp.csr_matrix):
-        dtype = type(data)
-        if graph_root:
-            _h5_write_coo_matrix(graph_root, path, sp.coo_matrix(data))
-            record.append('{}/{}::graph'.format(graph_root.name, path))
-        else:
-            logging.warning('Ignoring graph {} ({})'.format(path, dtype))
-    else:
-        logging.warning('Ignoring {} ({})'.format(path, dtype))
-
-
-def _is_exchangeable_loom(filename):
-    with h5py.File(filename, mode='r') as lm:
-        try:
-            loom_version = _h5_read_attrs(lm, 'LOOM_SPEC_VERSION').decode()
-        except Exception:
-            loom_version = '2.0.0'
-        return version.parse(loom_version) >= version.parse(EXCHANGEABLE_LOOM_VERSION)
-
-
-def _read_manifest(h5file):
-    return pd.DataFrame(
-        np.array(h5file['attr']['manifest']).astype(str),
-        columns=['loom_path', 'dtype', 'anndata_path', 'sce_path'],
-    )
-
-
-def read_exchangeable_loom(filename, sparse=True):
-    """Read exchangeable Loom
-
-    * Parameters
-        + filename : str
-        Path of the input exchangeable Loom file
-
-    * Returns
-        + adata : AnnData
-        An AnnData object
-    """
-    # Use anndata to read matrix, obs and var
-    adata = anndata.read_loom(filename, sparse=sparse, validate=False)
-
-    if not _is_exchangeable_loom(filename):
-        return adata
-
-    # Use h5py to read the rest
-    with h5py.File(filename, mode='r') as lm:
-        manifest = _read_manifest(lm)
-        for i, row in manifest.iterrows():
-            anndata_path = row['anndata_path']
-            loom_path = row['loom_path']
-            dtype = row['dtype']
-            if anndata_path and loom_path:
-                # Get data from loom_path
-                if loom_path.startswith('/.attrs['):
-                    lm_path = loom_path[8:-1] # remove '/.attrs['
-                    data = _h5_read_attrs(lm, lm_path)
-                else:
-                    data = lm[loom_path]
-                # Type conversion according to dtype
-                if dtype == 'array':
-                    data = np.array(data)
-                    # Convert to unicode string if bytes
-                    if data.dtype.kind == 'S':
-                        data = data.astype(str)
-                elif dtype == 'graph':
-                    data = sp.csr_matrix(_h5_read_coo_matrix(data))
-                elif dtype == 'scalar':
-                    if isinstance(data, (h5py.Dataset, np.ndarray)):
-                        data = data[0]
-                    if isinstance(data, bytes):
-                        data = data.decode()
-                elif dtype == 'csr_matrix':
-                    data = _h5_read_csr_matrix(data)
-                elif dtype == 'df':
-                    data = pd.DataFrame(data)
-
-                # Put data to the right location according to anndata_path
-                if anndata_path.startswith('/uns'):
-                    # For .uns, write recursive dictionary as necessary
-                    attr = adata.uns
-                    paths = anndata_path[5:].split('/')
-                    n_path = len(paths)
-                    for i, path in enumerate(paths):
-                        if i == n_path - 1:
-                            attr[path] = data
-                        else:
-                            if path not in attr:
-                                attr[path] = {}
-                            attr = attr[path]
-                elif anndata_path.startswith('/obsm'):
-                    attr = anndata_path[6:]
-                    adata.obsm[attr] = data
-                elif anndata_path.startswith('/varm'):
-                    attr = anndata_path[6:]
-                    adata.varm[attr] = data
-                elif anndata_path == '/raw.X':
-                    if adata.raw is None:
-                        adata.raw = anndata.AnnData(X=data)
-                    else:
-                        adata.raw.X = data
-                elif anndata_path == '/raw.var':
-                    adata.raw.var.index = data
-                else:
-                    logging.warning('Unexpected anndata path: {}'.format(anndata_path))
-    return adata
-
-
-def write_exchangeable_loom(adata, filename, col_graphs=['neighbors']):
-    """Write an AnnData object to an exchangeable Loom
-
-    * Parameters
-        + adata : AnnData
-        An AnnData object
-        + filename : str
-        Path of the output exchangeable Loom file
-    """
-    adata.write_loom(filename)
-    manifest = {'loom': [], 'dtype': [], 'anndata': [], 'sce': []}
-    with h5py.File(filename, mode='r+') as lm:
-        # Write modified LOOM_SPEC_VERSION
-        lm.attrs['LOOM_SPEC_VERSION'] = EXCHANGEABLE_LOOM_VERSION.encode()
-
-        # Write creation/modification info
-        if 'created_from' not in lm.attrs:
-            lm.attrs['created_from'] = 'anndata'
-        lm.attrs['last_modified_by'] = 'scanpy'
-
-        # Record which columns are used as colnames/rownames
-        lm['col_attrs'].attrs['CellID'] = 'obs_names'
-        lm['row_attrs'].attrs['Gene'] = 'var_names'
-
-        # Create necessary groups
-        lm.create_group('/attr')
-
-        # Write /obsm
-        for k in adata.obsm.keys():
-            arr = adata.obsm[k]
-            arr_name = k.replace('X_', '').upper()
-            # Derive paths
-            anndata_path = '/obsm/{}'.format(k)
-            loom_path = '/attr/reducedDims__{}'.format(arr_name)
-            sce_path = '@reducedDims@listData${}'.format(arr_name)
-            # Record mapping
-            manifest['loom'].append(loom_path)
-            manifest['dtype'].append('array')
-            manifest['anndata'].append(anndata_path)
-            manifest['sce'].append(sce_path)
-            # Write content to loom
-            lm.create_dataset(loom_path, data=arr)
-
-        # Write /varm
-        for k in adata.varm.keys():
-            arr = adata.varm[k]
-            # Derive paths
-            anndata_path = '/varm/{}'.format(k)
-            loom_path = '/attr/varm__{}'.format(k)
-            # Record mapping
-            manifest['loom'].append(loom_path)
-            manifest['dtype'].append('array')
-            manifest['anndata'].append(anndata_path)
-            manifest['sce'].append('')
-            # Write content to loom
-            lm.create_dataset(loom_path, data=arr)
-
-        # Write /uns
-        uns_entries = []
-        for k in adata.uns.keys():
-            item = adata.uns[k]
-            # Special handling of 'neighbors' as it contains graphs
-            if k in col_graphs:
-                # Write content to loom while recording loom path and dtype
-                _h5_write_recursive_dictionary(
-                    item,
-                    k,
-                    uns_entries,
-                    attr_root=lm,
-                    dataset_root=lm['/attr'],
-                    graph_root=lm['/col_graphs'],
-                )
-            else:
-                # Write content to loom while recording loom path and dtype
-                _h5_write_recursive_dictionary(
-                    item,
-                    k,
-                    uns_entries,
-                    attr_root=lm,
-                    dataset_root=lm['/attr'],
-                    graph_root=lm['/attr'],
-                )
-            # Derive paths
-            for entry in uns_entries:
-                loom_path, _, dtype = entry.partition('::')
-                if loom_path.startswith('/col_graphs/'):
-                    path = loom_path[12:]
-                    sce_path = '@colGraphs${}'.format(path)
-                else:
-                    if loom_path.startswith('/.attrs['):
-                        path = loom_path[8:-1] # remove '/.attrs['
-                    elif loom_path.startswith('/attr/'):
-                        path = loom_path[6:] # remove '/attr/'
-                    else:
-                        logging.warning('unexpected path: {}'.format(loom_path))
-                    sce_path = '@metadata${}'.format(path.replace('__', '$'))
-                anndata_path = '/uns/{}'.format(path.replace('__', '/'))
-                # Record mapping
-                manifest['loom'].append(loom_path)
-                manifest['dtype'].append(dtype)
-                manifest['anndata'].append(anndata_path)
-                manifest['sce'].append(sce_path)
-
-        # Write /raw
-        if adata.raw is not None:
-            _h5_write_csr_matrix(lm['/attr'], 'raw.X', adata.raw.X)
-            manifest['loom'].append('/attr/raw.X')
-            manifest['dtype'].append('csr_matrix')
-            manifest['anndata'].append('/raw.X')
-            manifest['sce'].append('@metadata$raw.X')
-            lm['/attr'].create_dataset(
-                'raw.var', data=adata.raw.var.index.values.astype(bytes))
-            manifest['loom'].append('/attr/raw.var')
-            manifest['dtype'].append('array')
-            manifest['anndata'].append('/raw.var')
-            manifest['sce'].append('@metadata$raw.var')
-
-        # Write mapping
-        lm['/attr'].create_dataset(
-            'manifest', data=pd.DataFrame(manifest).values.astype(np.character))
diff --git a/scanpy_scripts/lib/_leiden.py b/scanpy_scripts/lib/_leiden.py
index 2d6844c1..8ca18eb4 100644
--- a/scanpy_scripts/lib/_leiden.py
+++ b/scanpy_scripts/lib/_leiden.py
@@ -9,7 +9,8 @@
 def leiden(
         adata,
         resolution,
-        use_graph=None,
+        neighbors_key=None,
+        obsp=None,
         key_added=None,
         export_cluster=None,
         **kwargs
@@ -20,11 +21,7 @@ def leiden(
     keys = []
     if kwargs.get('restrict_to', None) and not kwargs['restrict_to'][0]:
         kwargs['restrict_to'] = None
-    adj_mat = None
-    if use_graph:
-        if use_graph not in adata.uns:
-            raise KeyError(f'"{use_graph}" is not a valid key of `.uns`.')
-        adj_mat = adata.uns[use_graph]['connectivities']
+    
     if not isinstance(resolution, (list, tuple)):
         if key_added is not None and not key_added.startswith('leiden_'):
             key_added = f'leiden_{key_added}'
@@ -33,7 +30,8 @@ def leiden(
         sc.tl.leiden(
             adata,
             resolution=resolution,
-            adjacency=adj_mat,
+            neighbors_key=neighbors_key,
+            obsp=obsp,
             key_added=key_added,
             **kwargs
         )
@@ -42,7 +40,7 @@ def leiden(
         for i, res in enumerate(resolution):
             res_key = str(res).replace('.', '_')
             if key_added is None:
-                graph_key = ('_' + use_graph) if use_graph else ''
+                graph_key = ('_' + f'{neighbors_key or obsp}') if neighbors or obsp else ''
                 key = f'leiden{graph_key}_r{res_key}'
             elif not isinstance(key_added, (list, tuple)):
                 key = f'leiden_{key_added}_r{res_key}'
@@ -54,7 +52,8 @@ def leiden(
             keys.extend(leiden(
                 adata,
                 resolution=res,
-                use_graph=use_graph,
+                neighbors_key=neighbors_key,
+                obsp=obsp,
                 key_added=key,
                 **kwargs,
             ))

From feabc73344001fdec2d2f54af9f60d15384ee894 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 6 Aug 2020 16:55:44 +0100
Subject: [PATCH 10/29] Add layer specification to rank_genes_groups call

---
 scanpy_scripts/cmd_options.py  |  7 +++++++
 scanpy_scripts/lib/_diffexp.py | 18 ++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index d2daa9ff..91d2e4b9 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1072,6 +1072,13 @@
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
         COMMON_OPTIONS['use_raw'],
+        COMMON_OPTIONS['key_added'],
+        click.option(
+            '--layer', '-l',
+            type=click.STRING,
+            default=None,
+            help="Key from adata.layers whose value will be used to perform tests on."
+        ),
         click.option(
             '--groupby', '-g',
             type=click.STRING,
diff --git a/scanpy_scripts/lib/_diffexp.py b/scanpy_scripts/lib/_diffexp.py
index f5f88d2c..5f2518bd 100644
--- a/scanpy_scripts/lib/_diffexp.py
+++ b/scanpy_scripts/lib/_diffexp.py
@@ -10,7 +10,8 @@ def diffexp(
         adata,
         use_raw=True,
         n_genes=None,
-        key_added=None,
+        key_added='rank_genes_groups',
+        layer=None,
         logreg_param=None,
         filter_params=None,
         save=None,
@@ -29,23 +30,24 @@ def diffexp(
         for key, val in logreg_param:
             kwargs[key] = val
 
-    sc.tl.rank_genes_groups(
-        adata, use_raw=use_raw, n_genes=n_genes, key_added=key_added, **kwargs)
-
     key_added = key_added if key_added else 'rank_genes_groups'
+    diff_key = (key_added + f'_{layer}') if layer else key_added
+
+    sc.tl.rank_genes_groups(
+        adata, use_raw=use_raw, n_genes=n_genes, key_added=diff_key, **kwargs)
 
-    de_tbl = extract_de_table(adata.uns[key_added])
+    de_tbl = extract_de_table(adata.uns[diff_key])
 
     if isinstance(filter_params, dict):
         sc.tl.filter_rank_genes_groups(
             adata,
-            key=key_added,
-            key_added=key_added + '_filtered',
+            key=diff_key,
+            key_added=diff_key + '_filtered',
             use_raw=use_raw,
             **filter_params,
         )
 
-        de_tbl = extract_de_table(adata.uns[key_added + '_filtered'])
+        de_tbl = extract_de_table(adata.uns[diff_key + '_filtered'])
         de_tbl = de_tbl.loc[de_tbl.genes.astype(str) != 'nan', :]
 
     if save:

From 8a34634fcc919441d917797dff93d3ad8ff0029a Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 6 Aug 2020 17:15:13 +0100
Subject: [PATCH 11/29] Updata paga call to allow RNA velocity, use
 Scanpy-native graph slot specification

---
 scanpy-scripts-tests.bats     |  4 ++--
 scanpy_scripts/cmd_options.py | 12 +++++++++++-
 scanpy_scripts/lib/_paga.py   |  6 ------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 1960e887..e20f4fb5 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -45,7 +45,7 @@ setup() {
     diffexp_tsv="${output_dir}/diffexp.tsv"
     diffexp_opt="-g leiden_k10_r0_7 --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv} -f loom"
     diffexp_obj="${output_dir}/diffexp.h5ad"
-    paga_opt="--use-graph neighbors_k10 --key-added k10_r0_7 --groups leiden_k10_r0_7 --model v1.2 -f loom"
+    paga_opt="--neighbors-key neighbors_k10 --key-added louvain_k10_r0_5 --groups louvain_k10_r0_5 --model v1.2"
     paga_obj="${output_dir}/paga.h5ad"
     diffmap_embed="${output_dir}/diffmap.tsv"
     diffmap_opt="--use-graph neighbors_k10 --n-comps 10 -E ${diffmap_embed}"
@@ -279,7 +279,7 @@ setup() {
         skip "$paga_obj exists and resume is set to 'true'"
     fi
 
-    run rm -f $paga_obj && eval "$scanpy paga $paga_opt $leiden_obj $paga_obj"
+    run rm -f $paga_obj && eval "$scanpy paga $paga_opt $louvain_obj $paga_obj"
 
     [ "$status" -eq 0 ]
     [ -f  "$paga_obj" ]
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 91d2e4b9..1247bcbe 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1166,7 +1166,7 @@
     'paga': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
-        COMMON_OPTIONS['knn_graph'][0], # --use-graph
+        COMMON_OPTIONS['knn_graph'][0], # --neighbors-key
         COMMON_OPTIONS['key_added'],
         click.option(
             '--groups',
@@ -1182,6 +1182,16 @@
             show_default=True,
             help='The PAGA connectivity model.',
         ),
+        click.option(
+            '--use-rna-velocity',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help='Use RNA velocity to orient edges in the abstracted graph and '
+            'estimate transitions. Requires that adata.uns contains a directed single-cell '
+            'graph with key velocity_graph. This feature might be subject to change in the '
+            'future.',
+        ),
     ],
 
     'diffmap': [
diff --git a/scanpy_scripts/lib/_paga.py b/scanpy_scripts/lib/_paga.py
index 6e49064b..6d320ce0 100644
--- a/scanpy_scripts/lib/_paga.py
+++ b/scanpy_scripts/lib/_paga.py
@@ -15,20 +15,14 @@
 
 def paga(
         adata,
-        use_graph='neighbors',
         key_added=None,
         **kwargs,
 ):
     """
     Wrapper function for sc.tl.paga, for supporting named slot
     """
-    _set_default_key(adata.uns, 'neighbors', use_graph)
-    _backup_default_key(adata.uns, 'paga')
-
     sc.tl.paga(adata, **kwargs)
 
-    _restore_default_key(adata.uns, 'neighbors', use_graph)
-
     if key_added:
         paga_key = f'paga_{key_added}'
         _rename_default_key(adata.uns, 'paga', paga_key)

From a161abf071e371d9fc46b6e628caf1e7889e7094 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Fri, 7 Aug 2020 12:09:30 +0100
Subject: [PATCH 12/29] Set diffmap, dpt to use neighbors_key, add other
 missing option.

---
 scanpy-scripts-tests.bats      |  4 ++--
 scanpy_scripts/cmd_options.py  | 13 +++++++++++--
 scanpy_scripts/lib/_diffmap.py | 14 +-------------
 scanpy_scripts/lib/_dpt.py     | 21 ++-------------------
 4 files changed, 16 insertions(+), 36 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index e20f4fb5..c3fea05b 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -48,9 +48,9 @@ setup() {
     paga_opt="--neighbors-key neighbors_k10 --key-added louvain_k10_r0_5 --groups louvain_k10_r0_5 --model v1.2"
     paga_obj="${output_dir}/paga.h5ad"
     diffmap_embed="${output_dir}/diffmap.tsv"
-    diffmap_opt="--use-graph neighbors_k10 --n-comps 10 -E ${diffmap_embed}"
+    diffmap_opt="--neighbors-key neighbors_k10 --n-comps 10 -E ${diffmap_embed}"
     diffmap_obj="${output_dir}/diffmap.h5ad"
-    dpt_opt="--use-graph neighbors_k10 --key-added k10 --n-dcs 10 --root leiden_k10_r0_7 0"
+    dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root louvain_k10_r0_5 0"
     dpt_obj="${output_dir}/dpt.h5ad"
     plt_embed_opt="--color leiden_k10_r0_7 -f loom --title test"
     plt_embed_pdf="${output_dir}/umap_leiden_k10_r0_7.pdf"
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 1247bcbe..7ea21508 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1197,7 +1197,7 @@
     'diffmap': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
-        COMMON_OPTIONS['knn_graph'][0], # --use-graph
+        COMMON_OPTIONS['knn_graph'][0], # --neighbors-key
         COMMON_OPTIONS['key_added'],
         COMMON_OPTIONS['export_embedding'],
         COMMON_OPTIONS['n_comps'],
@@ -1206,7 +1206,7 @@
     'dpt': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
-        COMMON_OPTIONS['knn_graph'][0], # --use-graph
+        COMMON_OPTIONS['knn_graph'][0], # --neighbors-key
         COMMON_OPTIONS['key_added'],
         click.option(
             '--root',
@@ -1239,6 +1239,15 @@
             'do not consider branches/groups that contain fewer than this fraction '
             'of the total number of data points.',
         ),
+        click.option(
+            '--disallow-kendall-tau-shift', 'allow_kendall_tau_shift',
+            is_flag=True,
+            default=True,
+            show_default=True,
+            help='By default: If a very small branch is detected upon '
+            'splitting, shift away from maximum correlation in Kendall tau criterion of '
+            '[Haghverdi16] to stabilize the splitting. Use flag to disable this.'
+        ),
     ],
 
     'embed': [
diff --git a/scanpy_scripts/lib/_diffmap.py b/scanpy_scripts/lib/_diffmap.py
index d52333fc..e3687058 100644
--- a/scanpy_scripts/lib/_diffmap.py
+++ b/scanpy_scripts/lib/_diffmap.py
@@ -4,10 +4,6 @@
 
 import scanpy as sc
 from ..obj_utils import (
-    _set_default_key,
-    _restore_default_key,
-    _backup_obsm_key,
-    _delete_obsm_backup_key,
     _rename_obsm_key,
     write_embedding,
 )
@@ -15,7 +11,6 @@
 
 def diffmap(
         adata,
-        use_graph='neighbors',
         key_added=None,
         export_embedding=None,
         **kwargs,
@@ -23,19 +18,12 @@ def diffmap(
     """
     Wrapper function for sc.tl.diffmap, for supporting named slot
     """
-    _set_default_key(adata.uns, 'neighbors', use_graph)
-    _backup_obsm_key(adata, 'X_diffmap')
-
     sc.tl.diffmap(adata, **kwargs)
-
-    _restore_default_key(adata.uns, 'neighbors', use_graph)
-
+    
     diffmap_key = 'X_diffmap'
     if key_added:
         diffmap_key = f'{diffmap_key}_{key_added}'
         _rename_obsm_key(adata, 'X_diffmap', diffmap_key)
-    else:
-        _delete_obsm_backup_key(adata, 'X_diffmap')
 
     if export_embedding is not None:
         write_embedding(adata, diffmap_key, export_embedding, key_added=key_added)
diff --git a/scanpy_scripts/lib/_dpt.py b/scanpy_scripts/lib/_dpt.py
index a5414bc8..98956d19 100644
--- a/scanpy_scripts/lib/_dpt.py
+++ b/scanpy_scripts/lib/_dpt.py
@@ -5,26 +5,19 @@
 import numpy as np
 import scanpy as sc
 from ..obj_utils import (
-    _set_default_key,
-    _restore_default_key,
-    _backup_default_key,
-    _delete_backup_key,
     _rename_default_key,
-    _set_obsm_key,
-    _restore_obsm_key,
 )
 
 
 def dpt(
         adata,
         root=None,
-        use_graph='neighbors',
         use_diffmap='X_diffmap',
         key_added=None,
         **kwargs,
 ):
     """
-    Wrapper function for sc.tl.dpt, for support named slot
+    Wrapper function for sc.tl.dpt
     """
     if root is None or not (isinstance(root, (list, tuple)) and len(root) == 2):
         root = (None, None)
@@ -35,19 +28,9 @@ def dpt(
         adata.uns['iroot'] = np.random.choice(
             np.flatnonzero(adata.obs[root[0]] == root[1]))
 
-    _set_default_key(adata.uns, 'neighbors', use_graph)
-    _set_obsm_key(adata, 'X_diffmap', use_diffmap)
-
-    _backup_default_key(adata.obs, 'dpt_pseudotime')
-
     sc.tl.dpt(adata, **kwargs)
-
-    _restore_default_key(adata.uns, 'neighbors', use_graph)
-    _restore_obsm_key(adata, 'X_diffmap', use_diffmap)
-
     if key_added:
         dpt_key = f'dpt_pseudotime_{key_added}'
         _rename_default_key(adata.obs, 'dpt_pseudotime', dpt_key)
-    else:
-        _delete_backup_key(adata.obs, 'dpt_pseudotime')
+    
     return adata

From 129483b35f2ed82d17b1b6f043887b3bc720e71a Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Mon, 10 Aug 2020 15:51:24 +0100
Subject: [PATCH 13/29] Fixes for embeddings, plot_paga to use new
 sc.pl.embeddings.

---
 scanpy-scripts-tests.bats     | 10 ++--
 scanpy_scripts/cmd_options.py | 99 +++++++++++++++++++++++++++++------
 scanpy_scripts/cmd_utils.py   |  3 +-
 scanpy_scripts/cmds.py        |  2 +-
 scanpy_scripts/lib/_paga.py   |  2 +-
 5 files changed, 91 insertions(+), 25 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index c3fea05b..60fb1d16 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -52,9 +52,9 @@ setup() {
     diffmap_obj="${output_dir}/diffmap.h5ad"
     dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root louvain_k10_r0_5 0"
     dpt_obj="${output_dir}/dpt.h5ad"
-    plt_embed_opt="--color leiden_k10_r0_7 -f loom --title test"
-    plt_embed_pdf="${output_dir}/umap_leiden_k10_r0_7.pdf"
-    plt_paga_opt="--use-key paga_k10_r0_7 --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff"
+    plt_embed_opt="--projection 2d --color louvain_k10_r0_5 --title test"
+    plt_embed_pdf="${output_dir}/umap_louvain_k10_r0_5.pdf"
+    plt_paga_opt="--use-key paga_louvain_k10_r0_5 --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff"
     plt_paga_pdf="${output_dir}/paga_k10_r0_7.pdf"
     test_clustering='leiden_k10_r0_3'
     test_markers='LDHB,CD3D,CD3E'
@@ -318,10 +318,10 @@ setup() {
         skip "$plt_embed_pdf exists and resume is set to 'true'"
     fi
 
-    run rm -f $plt_embed_pdf && eval "$scanpy plot embed $plt_embed_opt $leiden_obj $plt_embed_pdf"
+    run rm -f $plt_embed_pdf && eval "$scanpy plot embed $plt_embed_opt $louvain_obj $plt_embed_pdf"
 
     [ "$status" -eq 0 ]
-    [ -f  "$dpt_obj" ]
+    [ -f  "$plt_embed_pdf" ]
 }
 
 # Run Plot paga
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 7ea21508..36e8276e 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -177,6 +177,18 @@
         ),
     ],
 
+    
+    'layer':click.option(
+        '--layer',
+        type=CommaSeparatedText(simplify=True),
+        default=None,
+        show_default=True,
+        help='Name of the AnnData object layer that wants to be plotted. By '
+        'default adata.raw.X is plotted. If use_raw=False is set, then adata.X '
+        'is plotted. If layer is set to a valid layer name, then the layer is '
+        'plotted. layer takes precedence over use_raw.',
+    ),
+
     'n_comps': click.option(
         '--n-comps',
         type=click.INT,
@@ -310,16 +322,6 @@
             'computed using scanpy.tl.dendrogram(). If tl.dendrogram has not been '
             'called previously the function is called with default parameters.',
         ),
-        click.option(
-            '--layer',
-            type=CommaSeparatedText(simplify=True),
-            default=None,
-            show_default=True,
-            help='Name of the AnnData object layer that wants to be plotted. By '
-            'default adata.raw.X is plotted. If use_raw=False is set, then adata.X '
-            'is plotted. If layer is set to a valid layer name, then the layer is '
-            'plotted. layer takes precedence over use_raw.',
-        ),
         click.option(
             '--standard-scale',
             type=click.Choice(['var', 'obs']),
@@ -485,6 +487,23 @@
         show_default=True,
         help='If choosing a tree layout, this is the index of the root node.',
     ),
+
+    'plot_embed': [
+        click.option(
+            '--use-raw/--no-raw',
+            default=None,
+            show_default=True,
+            help='Use `.raw` attribute for coloring with gene expression. If '
+            '`None`, uses `.raw` if present.',
+        ),
+        click.option(
+            '--groups',
+            type=click.STRING,
+            default=None,
+            help='Key for categorical in `.obs`. You can pass your predefined '
+            'groups by choosing any categorical annotation of observations.',
+        ),
+    ]
 }
 
 CMD_OPTIONS = {
@@ -1254,6 +1273,7 @@
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['plot'],
         *COMMON_OPTIONS['frame_title'],
+        COMMON_OPTIONS['layer'],
         click.option(
             '--basis',
             type=click.STRING,
@@ -1269,13 +1289,6 @@
             show_default=True,
             help='Keys for annotations of observations/cells or variables/genes.',
         ),
-        click.option(
-            '--use-raw/--no-raw',
-            default=None,
-            show_default=True,
-            help='Use `.raw` attribute for coloring with gene expression. If '
-            '`None`, uses `.raw` if present.',
-        ),
         click.option(
             '--legend-loc',
             type=click.Choice(['right margin', 'on data']),
@@ -1299,12 +1312,60 @@
             help='Point size. Automatically computed if not specified.',
         ),
         COMMON_OPTIONS['gene_symbols'],
+        click.option(
+            '--edges',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help='Show edges.',
+        ),
+        click.option(
+            '--edges-width',
+            type=click.FLOAT,
+            default=0.1,
+            show_default=True,
+            help='Width of edges.',
+        ),
+        click.option(
+            '--edges-color',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help='Color of edges. See draw_networkx_edges().',
+        ),
+        COMMON_OPTIONS['knn_graph'][0], # --neighbors-key
+        click.option(
+            '--no-sort-order', 'sort_order',
+            is_flag=True,
+            default=True,
+            show_default=True,
+            help='Disable default behaviour: for continuous annotations used as '
+            'color parameter, plot data points with higher values on top of others.',
+        ),
+        *COMMON_OPTIONS['plot_embed'],
+        click.option(
+            '--components',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help="For instance, ['1,2', '2,3']. To plot all available components use 'all'.",
+        ),
+        click.option(
+            '--projection',
+            type=click.Choice(['2d', '3d']),
+            default='2d',
+            show_default=True,
+            help="Projection of plot."
+        ),
+         
     ],
 
     'plot_paga': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['plot'],
         *COMMON_OPTIONS['frame_title'],
+        *COMMON_OPTIONS['plot_embed'],
+        COMMON_OPTIONS['random_state'],
         click.option(
             '--use-key',
             type=click.STRING,
@@ -1431,6 +1492,7 @@
         COMMON_OPTIONS['use_raw'],
         COMMON_OPTIONS['var_names'],
         *COMMON_OPTIONS['rank_genes_groups_plots'],
+        COMMON_OPTIONS['layer'],
         *COMMON_OPTIONS['diffexp_plot'],
         COMMON_OPTIONS['gene_symbols'],
         *COMMON_OPTIONS['sviol'],
@@ -1443,6 +1505,7 @@
         COMMON_OPTIONS['use_raw'],
         COMMON_OPTIONS['var_names'],
         *COMMON_OPTIONS['rank_genes_groups_plots'],
+        COMMON_OPTIONS['layer'],
         *COMMON_OPTIONS['diffexp_plot'],
         COMMON_OPTIONS['gene_symbols'],
         *COMMON_OPTIONS['dot'],
@@ -1454,6 +1517,7 @@
         COMMON_OPTIONS['use_raw'],
         COMMON_OPTIONS['var_names'],
         *COMMON_OPTIONS['rank_genes_groups_plots'],
+        COMMON_OPTIONS['layer'],
         *COMMON_OPTIONS['diffexp_plot'],
         COMMON_OPTIONS['gene_symbols'],
     ],
@@ -1464,6 +1528,7 @@
         COMMON_OPTIONS['use_raw'],
         COMMON_OPTIONS['var_names'],
         *COMMON_OPTIONS['rank_genes_groups_plots'],
+        COMMON_OPTIONS['layer'],
         *COMMON_OPTIONS['diffexp_plot'],
         COMMON_OPTIONS['gene_symbols'],
         *COMMON_OPTIONS['heat'],
diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py
index 81e691b8..8f7a5665 100644
--- a/scanpy_scripts/cmd_utils.py
+++ b/scanpy_scripts/cmd_utils.py
@@ -161,6 +161,7 @@ def make_plot_function(func_name, kind=None):
     # Provide a function translation
 
     plot_funcs = {
+        'embedding': sc.pl.embedding,
         'scatter': sc.pl.scatter,
         'sviol': sc.pl.stacked_violin,
         'rgg_sviol': sc.pl.rank_genes_groups_stacked_violin,
@@ -231,7 +232,7 @@ def plot_function(
 
         if output_fig:
             prefix=''
-            if func_name == 'scatter':
+            if func_name == 'scatter' or func_name == 'embedding':
                 prefix =  kwargs.get('basis', func.__name__)
             elif kind:
                 prefix = kind
diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py
index 50887223..e697e69c 100644
--- a/scanpy_scripts/cmds.py
+++ b/scanpy_scripts/cmds.py
@@ -170,7 +170,7 @@
 
 PLOT_EMBED_CMD = make_subcmd(
     'embed',
-    make_plot_function('scatter'),
+    make_plot_function('embedding'),
     cmd_desc='Plot cell embeddings.',
     arg_desc=_IP_DESC,
 )
diff --git a/scanpy_scripts/lib/_paga.py b/scanpy_scripts/lib/_paga.py
index 6d320ce0..58bb84e9 100644
--- a/scanpy_scripts/lib/_paga.py
+++ b/scanpy_scripts/lib/_paga.py
@@ -49,7 +49,7 @@ def plot_paga(
     """Make PAGA plot
     """
     if basis is not None and f'X_{basis}' in adata.obsm.keys():
-        ax = sc.plotting._tools.scatterplots.plot_scatter(
+        ax = sc.pl.embedding(
             adata,
             basis=basis,
             color=color,

From 1d2ee553124aa16c9963c34ba34fd029cc82ffd5 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Mon, 10 Aug 2020 16:27:14 +0100
Subject: [PATCH 14/29] Fix plotting test params

---
 scanpy-scripts-tests.bats | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 60fb1d16..3f19bef0 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -42,21 +42,21 @@ setup() {
     leiden_tsv="${output_dir}/leiden.tsv"
     leiden_opt="-r 0.3,0.7 --neighbors-key neighbors_k10 --key-added k10 -F loom --loom-write-obsm-varm --export-cluster ${leiden_tsv}"
     leiden_obj="${output_dir}/leiden.loom"
+    test_clustering='louvain_k10_r0_5'
     diffexp_tsv="${output_dir}/diffexp.tsv"
-    diffexp_opt="-g leiden_k10_r0_7 --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv} -f loom"
+    diffexp_opt="-g ${test_clustering} --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv}"
     diffexp_obj="${output_dir}/diffexp.h5ad"
-    paga_opt="--neighbors-key neighbors_k10 --key-added louvain_k10_r0_5 --groups louvain_k10_r0_5 --model v1.2"
+    paga_opt="--neighbors-key neighbors_k10 --key-added ${test_clustering} --groups ${test_clustering} --model v1.2"
     paga_obj="${output_dir}/paga.h5ad"
     diffmap_embed="${output_dir}/diffmap.tsv"
     diffmap_opt="--neighbors-key neighbors_k10 --n-comps 10 -E ${diffmap_embed}"
     diffmap_obj="${output_dir}/diffmap.h5ad"
-    dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root louvain_k10_r0_5 0"
+    dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root ${test_clustering} 0"
     dpt_obj="${output_dir}/dpt.h5ad"
-    plt_embed_opt="--projection 2d --color louvain_k10_r0_5 --title test"
-    plt_embed_pdf="${output_dir}/umap_louvain_k10_r0_5.pdf"
-    plt_paga_opt="--use-key paga_louvain_k10_r0_5 --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff"
+    plt_embed_opt="--projection 2d --color ${test_clustering} --title test"
+    plt_embed_pdf="${output_dir}/umap_${test_clustering}.pdf"
+    plt_paga_opt="--use-key paga_${test_clustering} --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff"
     plt_paga_pdf="${output_dir}/paga_k10_r0_7.pdf"
-    test_clustering='leiden_k10_r0_3'
     test_markers='LDHB,CD3D,CD3E'
     diffexp_plot_opt="--var-names $test_markers --use-raw --dendrogram --groupby ${test_clustering}"
     plt_stacked_violin_opt="${diffexp_plot_opt} --no-jitter --swap-axes"
@@ -64,7 +64,7 @@ setup() {
     plt_dotplot_pdf="${output_dir}/dot_${test_clustering}_LDHB_CD3D_CD3E.pdf"
     plt_matrixplot_pdf="${output_dir}/matrix_${test_clustering}_LDHB_CD3D_CD3E.pdf"
     plt_heatmap_pdf="${output_dir}/heatmap_${test_clustering}_LDHB_CD3D_CD3E.pdf"
-    plt_rank_genes_groups_opt="--rgg --groups 3,5"
+    plt_rank_genes_groups_opt="--rgg --groups 3,4"
     plt_rank_genes_groups_stacked_violin_pdf="${output_dir}/rggsviolin_${test_clustering}.pdf"
     plt_rank_genes_groups_matrix_pdf="${output_dir}/rggmatrix_${test_clustering}.pdf"
     plt_rank_genes_groups_dot_pdf="${output_dir}/rggdot_${test_clustering}.pdf"
@@ -266,7 +266,7 @@ setup() {
         skip "$diffexp_obj exists and resume is set to 'true'"
     fi
 
-    run rm -f $diffexp_obj $diffexp_tsv && eval "$scanpy diffexp $diffexp_opt $leiden_obj $diffexp_obj"
+    run rm -f $diffexp_obj $diffexp_tsv && eval "$scanpy diffexp $diffexp_opt $louvain_obj $diffexp_obj"
 
     [ "$status" -eq 0 ]
     [ -f  "$diffexp_obj" ] && [ -f "$diffexp_tsv" ]

From 4b738de6db2e01b3696fb638242fca33de23bbf7 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Mon, 10 Aug 2020 17:43:31 +0100
Subject: [PATCH 15/29] Some plot files now have underscores

---
 scanpy_scripts/cmd_utils.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py
index 8f7a5665..a2449e07 100644
--- a/scanpy_scripts/cmd_utils.py
+++ b/scanpy_scripts/cmd_utils.py
@@ -235,9 +235,11 @@ def plot_function(
             if func_name == 'scatter' or func_name == 'embedding':
                 prefix =  kwargs.get('basis', func.__name__)
             elif kind:
-                prefix = kind
+                prefix = kind 
             elif func_name in plot_funcs:
                 prefix = plot_funcs[ func_name ].__name__.split('.')[-1]
+                if func_name in [ 'sviol', 'rgg_sviol', 'dot', 'rgg_dot', 'matrix', 'rgg_matrix' ]:
+                    prefix = prefix + '_'
 
             os.rename(
                 os.path.join(sc.settings.figdir, prefix + figname), output_fig)

From 85826e0078ed92b7884aeb3b77f68000230010b1 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 11 Aug 2020 11:10:55 +0100
Subject: [PATCH 16/29] Add wrapper for harmony_integrate

---
 scanpy-scripts-tests.bats     | 46 +++++++++++++++++++++++++++++++++++
 scanpy_scripts/cli.py         |  8 ++++++
 scanpy_scripts/cmd_options.py | 29 ++++++++++++++++++++++
 scanpy_scripts/cmds.py        |  8 ++++++
 4 files changed, 91 insertions(+)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 3f19bef0..08333678 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -69,6 +69,12 @@ setup() {
     plt_rank_genes_groups_matrix_pdf="${output_dir}/rggmatrix_${test_clustering}.pdf"
     plt_rank_genes_groups_dot_pdf="${output_dir}/rggdot_${test_clustering}.pdf"
     plt_rank_genes_groups_heatmap_pdf="${output_dir}/rggheatmap_${test_clustering}.pdf"
+    harmony_integrate_obj="${output_dir}/harmony_integrate.h5ad"
+    harmony_integrate_opt="--batch-key ${test_clustering}"
+    harmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings after harmony' --basis 'X_pca_harmony'"
+    noharmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings before harmony' --basis 'X_pca'"
+    harmony_integrated_pca_pdf="${output_dir}/harmony_pca_${test_clustering}.pdf"
+    noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf"
 
     if [ ! -d "$data_dir" ]; then
         mkdir -p $data_dir
@@ -442,6 +448,46 @@ setup() {
     [ -f  "$plt_rank_genes_groups_matrix_pdf" ]
 }
 
+# Do harmony batch correction, using clustering as batch (just for test purposes)
+
+@test "Run Harmony batch integration using clustering as batch" {
+    if [ "$resume" = 'true' ] && [ -f "$harmony_integrate_obj" ]; then
+        skip "$harmony_integrate_obj exists and resume is set to 'true'"
+    fi
+
+    run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony_integrate $harmony_integrate_opt $louvain_obj $harmony_integrate_obj"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$plt_rank_genes_groups_matrix_pdf" ]
+
+}
+
+# Run Plot PCA embedding before harmony
+
+@test "Run Plot PCA embedding before Harmony" {
+    if [ "$resume" = 'true' ] && [ -f "$noharmony_integrated_pca_pdf" ]; then
+        skip "$noharmony_integrated_pca_pdf exists and resume is set to 'true'"
+    fi
+
+    run rm -f $noharmony_integrated_pca_pdf && eval "$scanpy plot embed $noharmony_plt_embed_opt $louvain_obj $noharmony_integrated_pca_pdf"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$noharmony_integrated_pca_pdf" ]
+}
+
+# Run Plot PCA embedding after harmony
+
+@test "Run Plot PCA embedding after Harmony" {
+    if [ "$resume" = 'true' ] && [ -f "$harmony_integrated_pca_pdf" ]; then
+        skip "$harmony_integrated_pca_pdf exists and resume is set to 'true'"
+    fi
+
+    run rm -f $harmony_integrated_pca_pdf && eval "$scanpy plot embed $harmony_plt_embed_opt $harmony_integrate_obj $harmony_integrated_pca_pdf"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$harmony_integrated_pca_pdf" ]
+}
+
 # Local Variables:
 # mode: sh
 # End:
diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py
index dc7a07d7..248aa59a 100755
--- a/scanpy_scripts/cli.py
+++ b/scanpy_scripts/cli.py
@@ -30,6 +30,7 @@
     PLOT_DOT_CMD,
     PLOT_MATRIX_CMD,
     PLOT_HEATMAP_CMD,
+    HARMONY_INTEGRATE_CMD,
 )
 
 
@@ -101,6 +102,13 @@ def cluster():
 cli.add_command(DPT_CMD)
 
 
+@cli.group(cls=NaturalOrderGroup)
+def integrate():
+    """Integrate cells from different experimental batches."""
+
+integrate.add_command(HARMONY_INTEGRATE_CMD)
+
+
 @cli.group(cls=NaturalOrderGroup)
 def plot():
     """Visualise data."""
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 36e8276e..ce3a071b 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1269,6 +1269,35 @@
         ),
     ],
 
+    'harmony_integrate': [
+        *COMMON_OPTIONS['input'],
+        *COMMON_OPTIONS['output'],
+        click.option(
+            '--basis',
+            type=click.STRING,
+            default='X_pca',
+            show_default=True,
+            help="The name of the field in adata.obsm where the PCA table is "
+            "stored. Defaults to 'X_pca', which is the default for sc.tl.pca()."
+        ),
+        click.option(
+            '--batch-key', 'key',
+            type=click.STRING,
+            required=True,
+            help='The name of the column in adata.obs that differentiates among '
+            'experiments/batches.'
+        ),
+        click.option(
+            '--adjusted-basis',
+            type=click.STRING,
+            default='X_pca_harmony',
+            show_default=True,
+            help='The name of the field in adata.obsm where the adjusted PCA '
+            'table will be stored after running this function.'
+        ),
+        
+    ],
+
     'embed': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['plot'],
diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py
index e697e69c..614418bf 100644
--- a/scanpy_scripts/cmds.py
+++ b/scanpy_scripts/cmds.py
@@ -5,6 +5,7 @@
 import os
 import sys
 import scanpy as sc
+import scanpy.external as sce
 
 from .cmd_utils import (
     make_subcmd,
@@ -210,3 +211,10 @@
     arg_desc=_IP_DESC,
     opt_set='plot_paga'
 )
+
+HARMONY_INTEGRATE_CMD = make_subcmd(
+    'harmony_integrate',
+    sce.pp.harmony_integrate,
+    cmd_desc='Use harmonypy [Korunsky19] to integrate different experiments.',    
+    arg_desc=_IO_DESC,
+)

From bbe1f524a34e22ef9881ee89aa51c3f777b1a949 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 11 Aug 2020 13:14:03 +0100
Subject: [PATCH 17/29] Add bbknn CLI layer

---
 scanpy-scripts-tests.bats     | 17 +++++++
 scanpy_scripts/cli.py         |  2 +
 scanpy_scripts/cmd_options.py | 94 +++++++++++++++++++++++++++++++++++
 scanpy_scripts/cmds.py        |  8 +++
 scanpy_scripts/lib/_bbknn.py  | 28 +++++++++++
 5 files changed, 149 insertions(+)
 create mode 100644 scanpy_scripts/lib/_bbknn.py

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 08333678..39ee2d7f 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -75,6 +75,9 @@ setup() {
     noharmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings before harmony' --basis 'X_pca'"
     harmony_integrated_pca_pdf="${output_dir}/harmony_pca_${test_clustering}.pdf"
     noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf"
+    bbknn_obj="${output_dir}/bbknn.h5ad"
+    bbknn_opt="--batch-key ${test_clustering} --key-added bbknn"
+    
 
     if [ ! -d "$data_dir" ]; then
         mkdir -p $data_dir
@@ -488,6 +491,20 @@ setup() {
     [ -f  "$harmony_integrated_pca_pdf" ]
 }
 
+# Do bbknn batch correction, using clustering as batch (just for test purposes)
+
+@test "Run BBKNN batch integration using clustering as batch" {
+    if [ "$resume" = 'true' ] && [ -f "$bbknn_obj" ]; then
+        skip "$bbknn_obj exists and resume is set to 'true'"
+    fi
+
+    run rm -f $bbknn_obj && echo "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$plt_rank_genes_groups_matrix_pdf" ]
+
+}
+
 # Local Variables:
 # mode: sh
 # End:
diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py
index 248aa59a..a49f2f03 100755
--- a/scanpy_scripts/cli.py
+++ b/scanpy_scripts/cli.py
@@ -31,6 +31,7 @@
     PLOT_MATRIX_CMD,
     PLOT_HEATMAP_CMD,
     HARMONY_INTEGRATE_CMD,
+    BBKNN_CMD,
 )
 
 
@@ -107,6 +108,7 @@ def integrate():
     """Integrate cells from different experimental batches."""
 
 integrate.add_command(HARMONY_INTEGRATE_CMD)
+integrate.add_command(BBKNN_CMD)
 
 
 @cli.group(cls=NaturalOrderGroup)
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index ce3a071b..c1aa2298 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1298,6 +1298,100 @@
         
     ],
 
+    'bbknn': [
+        *COMMON_OPTIONS['input'],
+        *COMMON_OPTIONS['output'],
+        COMMON_OPTIONS['key_added'],
+        click.option(
+            '--use-rep', '-u',
+            type=click.STRING,
+            default='X_pca',
+            show_default=True,
+            help='The dimensionality reduction in .obsm to use for neighbour '
+            'detection.'
+        ),
+        COMMON_OPTIONS['use_pc'][0], # --n-pcs
+        click.option(
+            '--batch-key', 'batch_key',
+            type=click.STRING,
+            required=True,
+            help='adata.obs column name discriminating between your batches.'
+        ),
+        click.option(
+            '--no-approx', 'approx',
+            is_flag=True,
+            default=True,
+            help='Default behaviour is to use annoy’s approximate neighbour '
+            'finding. This results in a quicker run time for large datasets while also '
+            'potentially increasing the degree of batch correction. Use this flag to disable '
+            'that behaviour.',
+        ),
+        click.option(
+            '--metric', '-t',
+            type=click.Choice(['angular', 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']),
+            default='angular',
+            show_default=True,
+            help='A known metric’s name.'
+        ),
+        click.option(
+            '--neighbors-within-batch',
+            type=click.INT,
+            default=3,
+            show_default=True,
+            help='How many top neighbours to report for each batch; total '
+            'number of neighbours will be this number times the number of batches.'
+        ),
+        click.option(
+            '--trim',
+            type=click.INT,
+            default=None,
+            show_default=True,
+            help='Trim the neighbours of each cell to these many top '
+            'connectivities. May help with population independence and improve the tidiness '
+            'of clustering. The lower the value the more independent the individual '
+            'populations, at the cost of more conserved batch effect. If None, sets the '
+            'parameter value automatically to 10 times the total number of neighbours for '
+            'each cell. Set to 0 to skip.'
+        ),
+        click.option(
+            '--n-trees',
+            type=click.INT,
+            default=10,
+            show_default=True,
+            help='Only used when approx=True. The number of trees to construct '
+            'in the annoy forest. More trees give higher precision when querying, at the '
+            'cost of increased run time and resource intensity.'
+        ),
+        click.option(
+            '--no-use-faiss', 'use_faiss',
+            is_flag=True,
+            default=True,
+            help='Default behaviour If approx=False and the metric is '
+            '“euclidean”, is to use the faiss package to compute nearest neighbours if '
+            'installed. This improves performance at a minor cost to numerical precision as '
+            'faiss operates on float32. Use this flag to disable that behaviour.'
+        ),
+        click.option(
+            '--set-op-mix-ratio',
+            type=click.FLOAT,
+            default=1,
+            show_default=True,
+            help='UMAP connectivity computation parameter, float between 0 and '
+            '1, controlling the blend between a connectivity matrix formed exclusively from '
+            'mutual nearest neighbour pairs (0) and a union of all observed neighbour '
+            'relationships with the mutual pairs emphasised (1).'
+        ),
+        click.option(
+            '--local-connectivity',
+            type=click.INT,
+            default=1,
+            show_default=True,
+            help='UMAP connectivity computation parameter, how many nearest '
+            'neighbors of each cell are assumed to be fully connected (and given a '
+            'connectivity value of 1)'
+        ),
+    ],
+
     'embed': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['plot'],
diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py
index 614418bf..7c94a503 100644
--- a/scanpy_scripts/cmds.py
+++ b/scanpy_scripts/cmds.py
@@ -26,6 +26,7 @@
 from .lib._paga import paga
 from .lib._diffmap import diffmap
 from .lib._dpt import dpt
+from .lib._bbknn import bbknn
 
 LANG = os.environ.get('LANG', None)
 
@@ -218,3 +219,10 @@
     cmd_desc='Use harmonypy [Korunsky19] to integrate different experiments.',    
     arg_desc=_IO_DESC,
 )
+
+BBKNN_CMD = make_subcmd(
+    'bbknn',
+    bbknn,
+    cmd_desc='Batch balanced kNN [Polanski19].',
+    arg_desc=_IO_DESC,
+)
diff --git a/scanpy_scripts/lib/_bbknn.py b/scanpy_scripts/lib/_bbknn.py
new file mode 100644
index 00000000..4834dae9
--- /dev/null
+++ b/scanpy_scripts/lib/_bbknn.py
@@ -0,0 +1,28 @@
+"""
+scanpy external bbknn
+"""
+
+import scanpy.external as sce
+
+from ..obj_utils import (
+    _backup_default_key,
+    _delete_backup_key,
+    _rename_default_key,
+)
+
+# Wrapper for bbknn allowing use of non-standard slot
+
+def bbknn(adata, key_added=None, **kwargs):
+    """
+    Wrapper function for sce.pp.bbknn(), for supporting non-standard neighbors slot
+    """
+
+    _backup_default_key(adata.uns, 'neighbors')
+    sce.pp.bbknn(adata, **kwargs)    
+
+    if key_added:
+        _rename_default_key(adata.uns, 'neighbors',  f'neighbors_{key_added}')
+    else:
+        _delete_backup_key(adata.uns, 'neighbors')
+    
+    return adata

From 8e6e860929aa943a34437bdfd71882a9264cf234 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 11 Aug 2020 16:41:46 +0100
Subject: [PATCH 18/29] Add batch correction methods to dependencies for
 testing

---
 setup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ca4e0eef..c94b1ce3 100644
--- a/setup.py
+++ b/setup.py
@@ -47,6 +47,8 @@
         'loompy>=2.0.0,<3.0.0',
         'MulticoreTSNE',
         'Click',
-        'umap-learn<0.4.0'
+        'umap-learn<0.4.0',
+        'harmonypy>=0.0.5',
+        'bbknn>=1.3.12'
     ],
 )

From 2a564e79a83aeed37c8ea333fbf2c9db508f9d86 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 11 Aug 2020 16:42:27 +0100
Subject: [PATCH 19/29] Add more harmonypy options

---
 scanpy_scripts/cmd_options.py | 84 ++++++++++++++++++++++++++++++++++-
 1 file changed, 83 insertions(+), 1 deletion(-)

diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index c1aa2298..e2d6219d 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1295,7 +1295,89 @@
             help='The name of the field in adata.obsm where the adjusted PCA '
             'table will be stored after running this function.'
         ),
-        
+        click.option(
+            '--theta',
+            type=click.FLOAT,
+            default=2,
+            show_default=True,
+            help='Diversity clustering penalty parameter. theta=0 does not encourage any '
+            'diversity. Larger values of theta result in more diverse clusters.'
+        ),
+        click.option(
+            '--lambda', 'lamb',
+            type=click.FLOAT,
+            default=1,
+            show_default=True,
+            help='Ridge regression penalty parameter. Lambda must be strictly '
+            'positive.  Smaller values result in more aggressive correction.'
+        ),
+        click.option(
+            '--sigma',
+            type=click.FLOAT,
+            default=0.1,
+            show_default=True,
+            help='Width of soft kmeans clusters. Sigma scales the distance from '
+            'a cell to cluster centroids. Larger values of sigma result in cells assigned to '
+            'more clusters. Smaller values of sigma make soft kmeans cluster approach hard '
+            'clustering.'
+        ),
+        click.option(
+            '--n-clust', 'nclust',
+            type=click.INT,
+            default=None,
+            show_default=False,
+            help='Number of clusters in model. nclust=1 equivalent to simple '
+            'linear regression.'
+        ),
+        click.option(
+            '--tau',
+            type=click.INT,
+            default=0,
+            show_default=True,
+            help='Protection against overclustering small datasets with large ones. '
+            'tau is the expected number of cells per cluster.'
+        ),
+        click.option(
+            '--block-size',
+            type=click.FLOAT,
+            default=0.05,
+            show_default=True,
+            help='What proportion of cells to update during clustering. Between '
+            '0 to 1, default 0.05. Larger values may be faster but less accurate.'
+        ),
+        click.option(
+            '--max-iter-cluster', 'max_iter_kmeans',
+            type=click.INT,
+            default=20,
+            show_default=True,
+            help='Maximum number of rounds to run clustering at each round of '
+            'Harmony.'
+        ),
+        click.option(
+            '--max-iter-harmony',
+            type=click.INT,
+            default=10,
+            show_default=True,
+            help='Maximum number of rounds to run Harmony. One round of Harmony '
+            'involves one clustering and one correction step.'
+        ),
+        click.option(
+            '--epsilon-cluster',
+            type=click.FLOAT,
+            default=1e-5,
+            show_default=True,
+            help='Convergence tolerance for clustering round of Harmony Set to '
+            '-Inf to never stop early.'
+        ),
+        click.option(
+            '--epsilon-harmony',
+            type=click.FLOAT,
+            default=1e-5,
+            show_default=True,
+            help='Convergence tolerance for clustering round of Harmony Set to '
+            '-Inf to never stop early.'
+        ),
+        COMMON_OPTIONS['random_state'],
     ],
 
     'bbknn': [

From 0faa36948e028e224f0cf190f98342512d8ad790 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 13 Aug 2020 19:31:34 +0100
Subject: [PATCH 20/29] Added MNN batch correction

---
 scanpy-scripts-tests.bats     |  18 +++++-
 scanpy_scripts/cli.py         |   2 +
 scanpy_scripts/cmd_options.py | 102 ++++++++++++++++++++++++++++++++++
 scanpy_scripts/cmds.py        |   8 +++
 scanpy_scripts/lib/_mnn.py    |  85 ++++++++++++++++++++++++++++
 5 files changed, 214 insertions(+), 1 deletion(-)
 create mode 100644 scanpy_scripts/lib/_mnn.py

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 39ee2d7f..ec12d447 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -77,6 +77,8 @@ setup() {
     noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf"
     bbknn_obj="${output_dir}/bbknn.h5ad"
     bbknn_opt="--batch-key ${test_clustering} --key-added bbknn"
+    mnn_obj="${output_dir}/mnn.h5ad"
+    mnn_opt="--batch-key ${test_clustering}"
     
 
     if [ ! -d "$data_dir" ]; then
@@ -498,13 +500,27 @@ setup() {
         skip "$bbknn_obj exists and resume is set to 'true'"
     fi
 
-    run rm -f $bbknn_obj && echo "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj"
+    run rm -f $bbknn_obj && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj"
 
     [ "$status" -eq 0 ]
     [ -f  "$plt_rank_genes_groups_matrix_pdf" ]
 
 }
 
+# Do bbknn batch correction, using clustering as batch (just for test purposes)
+
+@test "Run MNN batch integration using clustering as batch" {
+    if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then
+        skip "$mnn_obj exists and resume is set to 'true'"
+    fi
+
+    run rm -f $mnn_obj && eval "$scanpy integrate mnn_correct $mnn_opt $louvain_obj $mnn_obj"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$mnn_obj" ]
+
+}
+
 # Local Variables:
 # mode: sh
 # End:
diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py
index a49f2f03..0fee8b9f 100755
--- a/scanpy_scripts/cli.py
+++ b/scanpy_scripts/cli.py
@@ -32,6 +32,7 @@
     PLOT_HEATMAP_CMD,
     HARMONY_INTEGRATE_CMD,
     BBKNN_CMD,
+    MNN_CORRECT_CMD,
 )
 
 
@@ -109,6 +110,7 @@ def integrate():
 
 integrate.add_command(HARMONY_INTEGRATE_CMD)
 integrate.add_command(BBKNN_CMD)
+integrate.add_command(MNN_CORRECT_CMD)
 
 
 @cli.group(cls=NaturalOrderGroup)
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index e2d6219d..e02ccc1f 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1380,6 +1380,108 @@
         COMMON_OPTIONS['random_state'],
     ],
 
+    'mnn_correct': [
+        *COMMON_OPTIONS['input'],
+        *COMMON_OPTIONS['output'],
+        click.option(
+            '--layer', '-l',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help="Layer to batch correct. By default corrects the contents of .X."
+        ),
+        click.option(
+            '--key-added',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help="Key under which to add the computed results. By default a new "
+            "layer will be created called 'mnnnn', 'bbknn_{layer}' or "
+            "'bbknn_layer_{key_added}' where those parameters were specified. A value of 'X' "
+            "causes batch-corrected values to overwrite the original content of .X."
+        ),
+        click.option(
+            '--var-subset',
+            type=(click.STRING, CommaSeparatedText()),
+            multiple=True,
+            help="The subset of vars (list of str) to be used when performing "
+            "MNN correction in the format of '--var-subset <name> <values>'. Typically, use "
+            "the highly variable genes (HVGs) like '--var-subset highly_variable True'. When "
+            "unset, uses all vars."
+        ),
+        click.option(
+            '--batch-key', 'batch_key',
+            type=click.STRING,
+            required=True,
+            help='adata.obs column name discriminating between your batches.'
+        ),
+        click.option(
+            '--n-neighbors', '-k',
+            type=CommaSeparatedText(click.INT, simplify=True),
+            default=20,
+            show_default=True,
+            help='Number of mutual nearest neighbors.'
+        ),
+        click.option(
+            '--sigma',
+            type=click.FLOAT,
+            default=1.0,
+            show_default=True,
+            help='The bandwidth of the Gaussian smoothing kernel used to '
+            'compute the correction vectors.'
+        ),
+        click.option(
+            '--no-cos_norm_in', 'cos_norm_in',
+            is_flag=True,
+            default=True,
+            help='Default behaviour is to perform cosine normalization on the '
+            'input data prior to calculating distances between cells. Use this '
+            'flag to disable that behaviour.'
+        ),
+        click.option(
+            '--no-cos_norm_out', 'cos_norm_out',
+            is_flag=True,
+            default=True,
+            help='Default behaviour is to perform cosine normalization prior to '
+            'computing corrected expression values. Use this flag to disable that '
+            'behaviour.'
+        ),
+        click.option(
+            '--svd-dim',
+            type=click.INT,
+            default=None,
+            show_default=True,
+            help='The number of dimensions to use for summarizing biological '
+            'substructure within each batch. If not set, biological components '
+            'will not be removed from the correction vectors.'
+        ),
+        click.option(
+            '--no-var-adj',
+            is_flag=True,
+            default=True,
+            help='Default behaviour is to adjust variance of the correction '
+            'vectors. Use this flag to disable that behaviour. Note this step takes most '
+            'computing time.'
+        ),
+        click.option(
+            '--compute-angle',
+            is_flag=True,
+            default=False,
+            help='When set, compute the angle between each cell’s correction '
+            'vector and the biological subspace of the reference batch.'
+        ),
+        click.option(
+            '--svd-mode',
+            type=click.Choice(['svd', 'rsvd', 'irlb']),
+            default='rsvd',
+            show_default=True,
+            help="'svd' computes SVD using a non-randomized SVD-via-ID "
+            "algorithm, while 'rsvd' uses a randomized version. 'irlb' performs truncated "
+            "SVD by implicitly restarted Lanczos bidiagonalization (forked from "
+            "https://github.com/airysen/irlbpy)."
+        ),
+    ],
+
     'bbknn': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py
index 7c94a503..1cd32732 100644
--- a/scanpy_scripts/cmds.py
+++ b/scanpy_scripts/cmds.py
@@ -27,6 +27,7 @@
 from .lib._diffmap import diffmap
 from .lib._dpt import dpt
 from .lib._bbknn import bbknn
+from .lib._mnn import mnn_correct
 
 LANG = os.environ.get('LANG', None)
 
@@ -226,3 +227,10 @@
     cmd_desc='Batch balanced kNN [Polanski19].',
     arg_desc=_IO_DESC,
 )
+
+MNN_CORRECT_CMD = make_subcmd(
+    'mnn_correct',
+    mnn_correct,
+    cmd_desc='Correct batch effects by matching mutual nearest neighbors [Haghverdi18] [Kang18].',
+    arg_desc=_IO_DESC,
+)
diff --git a/scanpy_scripts/lib/_mnn.py b/scanpy_scripts/lib/_mnn.py
new file mode 100644
index 00000000..c84358ab
--- /dev/null
+++ b/scanpy_scripts/lib/_mnn.py
@@ -0,0 +1,85 @@
+"""
+scanpy external mnn
+"""
+
+import scanpy.external as sce
+import numpy as np 
+import click
+
+# Wrapper for mnn allowing use of non-standard slot
+
+def mnn_correct(adata, batch_key=None, key_added=None, var_subset=None, layer=None, **kwargs):
+    """
+    Wrapper function for sce.pp.mnn_correct(), for supporting non-standard neighbors slot
+    """
+
+    # mnn will use .X, so we need to put other layers there for processing
+
+    if layer:
+        adata.layers['X_backup'] = adata.X
+        adata.X = adata.layers[layer]
+
+    # mnn_correct() wants batches in separate adatas
+
+    batches = np.unique(adata.obs[batch_key])
+    alldata = []
+    for batch in batches:
+        alldata.append( adata[adata.obs[batch_key] == batch,] )
+
+    # Process var_subset into a list of strings that can be provided to
+    # mnn_correct()
+
+    if var_subset is not None and len(var_subset) > 0 and var_subset[0] is not None:
+   
+        subset = []
+ 
+        for name, values in var_subset :
+            if name in adata.var:
+                if adata.var[name].dtype == 'bool':
+                    values = [ True if x.lower() == "true" else x for x in values  ]
+            else:
+                raise click.ClickException(f'Var "{name}" unavailable')
+
+            ind = [ x in values for x in adata.var[name]  ]
+            subset = subset + adata.var.index[ ind  ].to_list()
+
+        var_subset = set(subset)
+        print('Will use %d selected genes for MNN' % len(var_subset))
+
+    else:
+        var_subset = None
+
+    # Here's the main bit
+
+    cdata = sce.pp.mnn_correct(*alldata, var_subset = var_subset, do_concatenate = True, index_unique = None, **kwargs)    
+    
+    # If user has specified key_added = X then they want us to overwrite .X,
+    # othwerwise copy the .X to a named layer of the original object. In either
+    # case make sure obs and var are the same as the original.
+
+    if key_added is None or key_added != 'X':
+
+        mnn_key = 'mnn'
+        if layer:
+            mnn_key = f"{mnn_key}_{layer}"
+        
+            # Layers is set (so we're not storing computed results in the .X,
+            # and we had to overwrite .X to run mnn), and key_added shows we're
+            # not storing in the .X, so we need to restore from the backup.
+
+            adata.X = adata.layers['X_backup']
+
+        if key_added:
+            mnn_key = f"{mnn_key}_{key_added}"
+   
+        adata.layers[mnn_key] = cdata[0][adata.obs.index, adata.var.index].X
+
+    else:
+        adata.X = cdata[0][adata.obs.index, adata.var.index].X
+
+    # Delete the backup of .X if we needed one 
+
+    if layer:
+        del adata.layers['X_backup'] 
+
+    return adata

From 4fb0b96af4bd307b7fdfd2ea6ec4288876233bb1 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Thu, 13 Aug 2020 19:37:53 +0100
Subject: [PATCH 21/29] add mnnpy to deps

---
 setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index c94b1ce3..3760c759 100644
--- a/setup.py
+++ b/setup.py
@@ -49,6 +49,7 @@
         'Click',
         'umap-learn<0.4.0',
         'harmonypy>=0.0.5',
-        'bbknn>=1.3.12'
+        'bbknn>=1.3.12',
+        'mnnpy>=0.1.9.5'
     ],
 )

From 924a74599ba74b3fcc4b51bede2118b800eeee1b Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Fri, 14 Aug 2020 15:22:01 +0100
Subject: [PATCH 22/29] Minor fixes, add ComBat, unify args

---
 scanpy-scripts-tests.bats     | 17 +++++++-
 scanpy_scripts/cli.py         |  2 +
 scanpy_scripts/cmd_options.py | 82 ++++++++++++++++++++++-------------
 scanpy_scripts/cmds.py        |  8 ++++
 scanpy_scripts/lib/_bbknn.py  |  4 +-
 scanpy_scripts/lib/_mnn.py    |  6 +--
 6 files changed, 83 insertions(+), 36 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index ec12d447..0839d19a 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -79,6 +79,8 @@ setup() {
     bbknn_opt="--batch-key ${test_clustering} --key-added bbknn"
     mnn_obj="${output_dir}/mnn.h5ad"
     mnn_opt="--batch-key ${test_clustering}"
+    combat_obj="${output_dir}/combat.h5ad"
+    combat_opt="--batch-key ${test_clustering}"
     
 
     if [ ! -d "$data_dir" ]; then
@@ -504,10 +506,9 @@ setup() {
 
     [ "$status" -eq 0 ]
     [ -f  "$plt_rank_genes_groups_matrix_pdf" ]
-
 }
 
-# Do bbknn batch correction, using clustering as batch (just for test purposes)
+# Do MNN batch correction, using clustering as batch (just for test purposes)
 
 @test "Run MNN batch integration using clustering as batch" {
     if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then
@@ -518,7 +519,19 @@ setup() {
 
     [ "$status" -eq 0 ]
     [ -f  "$mnn_obj" ]
+}
+
+# Do ComBat batch correction, using clustering as batch (just for test purposes)
 
+@test "Run Combat batch integration using clustering as batch" {
+    if [ "$resume" = 'true' ] && [ -f "$combat_obj" ]; then
+        skip "$combat_obj exists and resume is set to 'true'"
+    fi
+
+    run rm -f $combat_obj && eval "$scanpy integrate combat $combat_opt $louvain_obj $combat_obj"
+
+    [ "$status" -eq 0 ]
+    [ -f  "$combat_obj" ]
 }
 
 # Local Variables:
diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py
index 0fee8b9f..b5ed0bfc 100755
--- a/scanpy_scripts/cli.py
+++ b/scanpy_scripts/cli.py
@@ -33,6 +33,7 @@
     HARMONY_INTEGRATE_CMD,
     BBKNN_CMD,
     MNN_CORRECT_CMD,
+    COMBAT_CMD,
 )
 
 
@@ -111,6 +112,7 @@ def integrate():
 integrate.add_command(HARMONY_INTEGRATE_CMD)
 integrate.add_command(BBKNN_CMD)
 integrate.add_command(MNN_CORRECT_CMD)
+integrate.add_command(COMBAT_CMD)
 
 
 @cli.group(cls=NaturalOrderGroup)
diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index e02ccc1f..1185a96e 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -503,7 +503,23 @@
             help='Key for categorical in `.obs`. You can pass your predefined '
             'groups by choosing any categorical annotation of observations.',
         ),
-    ]
+    ],
+
+    'batch_key': click.option(
+        '--batch-key', 'key',
+        type=click.STRING,
+        required=True,
+        help='The name of the column in adata.obs that differentiates among '
+        'experiments/batches.'
+    ),
+    
+    'batch_layer': click.option(
+        '--layer', '-l',
+        type=click.STRING,
+        default=None,
+        show_default=True,
+        help="Layer to batch correct. By default corrects the contents of .X."
+    ),
 }
 
 CMD_OPTIONS = {
@@ -1269,9 +1285,40 @@
         ),
     ],
 
+    'combat': [
+        *COMMON_OPTIONS['input'],
+        *COMMON_OPTIONS['output'],
+        COMMON_OPTIONS['batch_key'],
+        COMMON_OPTIONS['batch_layer'],
+        click.option(
+            '--key-added',
+            type=click.STRING,
+            default=None,
+            show_default=True,
+            help="Key under which to add the computed results. By default a new "
+            "layer will be created called 'combat', 'combat_{layer}' or "
+            "'combat_layer_{key_added}' where those parameters were specified. A value of 'X' "
+            "causes batch-corrected values to overwrite the original content of .X."
+        ),
+        click.option(
+            '--covariates',
+            type=(CommaSeparatedText()),
+            default=None,
+            show_default=True,
+            help="Comma-separated list of additional covariates besides the "
+            "batch variable such as adjustment variables or biological condition. This "
+            "parameter refers to the design matrix X in Equation 2.1 in [Johnson07] and to "
+            "the mod argument in the original combat function in the sva R package.  Note "
+            "that not including covariates may introduce bias or lead to the removal of "
+            "biological signal in unbalanced designs."
+        ),
+        
+    ],
+
     'harmony_integrate': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
+        COMMON_OPTIONS['batch_key'],
         click.option(
             '--basis',
             type=click.STRING,
@@ -1280,13 +1327,6 @@
             help="The name of the field in adata.obsm where the PCA table is "
             "stored. Defaults to 'X_pca', which is the default for sc.tl.pca()."
         ),
-        click.option(
-            '--batch-key', 'key',
-            type=click.STRING,
-            required=True,
-            help='The name of the column in adata.obs that differentiates among '
-            'experiments/batches.'
-        ),
         click.option(
             '--adjusted-basis',
             type=click.STRING,
@@ -1383,21 +1423,16 @@
     'mnn_correct': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
-        click.option(
-            '--layer', '-l',
-            type=click.STRING,
-            default=None,
-            show_default=True,
-            help="Layer to batch correct. By default corrects the contents of .X."
-        ),
+        COMMON_OPTIONS['batch_key'],
+        COMMON_OPTIONS['batch_layer'],
         click.option(
             '--key-added',
             type=click.STRING,
             default=None,
             show_default=True,
             help="Key under which to add the computed results. By default a new "
-            "layer will be created called 'mnnnn', 'bbknn_{layer}' or "
-            "'bbknn_layer_{key_added}' where those parameters were specified. A value of 'X' "
+            "layer will be created called 'mnn', 'mnn_{layer}' or "
+            "'mnn_layer_{key_added}' where those parameters were specified. A value of 'X' "
             "causes batch-corrected values to overwrite the original content of .X."
         ),
         click.option(
@@ -1409,12 +1444,6 @@
             "the highly variable genes (HVGs) like '--var-subset highly_variable True'. When "
             "unset, uses all vars."
         ),
-        click.option(
-            '--batch-key', 'batch_key',
-            type=click.STRING,
-            required=True,
-            help='adata.obs column name discriminating between your batches.'
-        ),
         click.option(
             '--n-neighbors', '-k',
             type=CommaSeparatedText(click.INT, simplify=True),
@@ -1486,6 +1515,7 @@
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
         COMMON_OPTIONS['key_added'],
+        COMMON_OPTIONS['batch_key'],
         click.option(
             '--use-rep', '-u',
             type=click.STRING,
@@ -1495,12 +1525,6 @@
             'detection.'
         ),
         COMMON_OPTIONS['use_pc'][0], # --n-pcs
-        click.option(
-            '--batch-key', 'batch_key',
-            type=click.STRING,
-            required=True,
-            help='adata.obs column name discriminating between your batches.'
-        ),
         click.option(
             '--no-approx', 'approx',
             is_flag=True,
diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py
index 1cd32732..5b9c5470 100644
--- a/scanpy_scripts/cmds.py
+++ b/scanpy_scripts/cmds.py
@@ -28,6 +28,7 @@
 from .lib._dpt import dpt
 from .lib._bbknn import bbknn
 from .lib._mnn import mnn_correct
+from .lib._combat import combat
 
 LANG = os.environ.get('LANG', None)
 
@@ -214,6 +215,13 @@
     opt_set='plot_paga'
 )
 
+COMBAT_CMD = make_subcmd(
+    'combat',
+    combat,
+    cmd_desc='ComBat function for batch effect correction',
+    arg_desc=_IO_DESC
+)
+
 HARMONY_INTEGRATE_CMD = make_subcmd(
     'harmony_integrate',
     sce.pp.harmony_integrate,
diff --git a/scanpy_scripts/lib/_bbknn.py b/scanpy_scripts/lib/_bbknn.py
index 4834dae9..a10cb3a0 100644
--- a/scanpy_scripts/lib/_bbknn.py
+++ b/scanpy_scripts/lib/_bbknn.py
@@ -12,13 +12,13 @@
 
 # Wrapper for bbknn allowing use of non-standard slot
 
-def bbknn(adata, key_added=None, **kwargs):
+def bbknn(adata, key=None, key_added=None, **kwargs):
     """
     Wrapper function for sce.pp.bbknn(), for supporting non-standard neighbors slot
     """
 
     _backup_default_key(adata.uns, 'neighbors')
-    sce.pp.bbknn(adata, **kwargs)    
+    sce.pp.bbknn(adata, batch_key = key, **kwargs)    
 
     if key_added:
         _rename_default_key(adata.uns, 'neighbors',  f'neighbors_{key_added}')
diff --git a/scanpy_scripts/lib/_mnn.py b/scanpy_scripts/lib/_mnn.py
index c84358ab..0a559a64 100644
--- a/scanpy_scripts/lib/_mnn.py
+++ b/scanpy_scripts/lib/_mnn.py
@@ -8,7 +8,7 @@
 
 # Wrapper for mnn allowing use of non-standard slot
 
-def mnn_correct(adata, batch_key=None, key_added=None, var_subset=None, layer=None, **kwargs):
+def mnn_correct(adata, key=None, key_added=None, var_subset=None, layer=None, **kwargs):
     """
     Wrapper function for sce.pp.mnn_correct(), for supporting non-standard neighbors slot
     """
@@ -21,10 +21,10 @@ def mnn_correct(adata, batch_key=None, key_added=None, var_subset=None, layer=No
 
     # mnn_correct() wants batches in separate adatas
 
-    batches = np.unique(adata.obs[batch_key])
+    batches = np.unique(adata.obs[key])
     alldata = []
     for batch in batches:
-        alldata.append( adata[adata.obs[batch_key] == batch,] )
+        alldata.append( adata[adata.obs[key] == batch,] )
 
     # Process var_subset into a list of strings that can be provided to
     # mnn_correct()

From 79ad11cf5c7fd2a0582f474d7690006756c93733 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Fri, 14 Aug 2020 17:05:11 +0100
Subject: [PATCH 23/29] Set batch handling to scanpy-native method for hvg, to
 properly respond to flavor = 'seurat_v3'

---
 scanpy_scripts/cmd_options.py | 26 ++++++++++++++-----
 scanpy_scripts/lib/_hvg.py    | 48 ++++++++---------------------------
 2 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index 1185a96e..f0dbf7e6 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -713,6 +713,14 @@
             help='Cutoffs for the dispersion of expression'
             'in the format of "-d min max".',
         ),
+        click.option(
+            '--span',
+            type=click.FLOAT,
+            default=0.3,
+            show_default=True,
+            help="The fraction of the data (cells) used when estimating the "
+            "variance in the loess model fit if flavor='seurat_v3'."    
+        ),
         click.option(
             '--n-bins', '-b',
             type=click.INT,
@@ -729,7 +737,7 @@
         ),
         click.option(
             '--flavor', '-v',
-            type=click.Choice(['seurat', 'cellranger']),
+            type=click.Choice(['seurat', 'cellranger', 'seurat_v3']),
             default='seurat',
             show_default=True,
             help='Choose the flavor for computing normalized dispersion.',
@@ -742,12 +750,16 @@
             'only flag highly-variable genes.',
         ),
         click.option(
-            '--by-batch', '-B',
-            type=(click.STRING, click.INT),
-            default=(None, None),
-            show_default=True,
-            help='Find highly variable genes within each batch defined by <TEXT> '
-            'then pool and keep those found in at least <INTEGER> batches.',
+            '--batch-key', 'batch_key',
+            type=click.STRING,
+            default=None,
+            help="If specified, highly-variable genes are selected within each "
+            "batch separately and merged. This simple process avoids the selection of "
+            "batch-specific genes and acts as a lightweight batch correction method. For all "
+            "flavors, genes are first sorted by how many batches they are a HVG. For "
+            "dispersion-based flavors ties are broken by normalized dispersion. If flavor = "
+            "'seurat_v3', ties are broken by the median (across batches) rank based on "
+            "within-batch normalized variance."
         ),
     ],
 
diff --git a/scanpy_scripts/lib/_hvg.py b/scanpy_scripts/lib/_hvg.py
index a4101729..3cd53b73 100644
--- a/scanpy_scripts/lib/_hvg.py
+++ b/scanpy_scripts/lib/_hvg.py
@@ -9,50 +9,24 @@ def hvg(
         adata,
         mean_limits=(0.0125, 3),
         disp_limits=(0.5, float('inf')),
-        subset=False,
-        by_batch=None,
         **kwargs,
 ):
     """
-    Wrapper function for sc.highly_variable_genes(), mainly to support searching
-    by batch and pooling.
+    Wrapper function for sc.highly_variable_genes()
     """
 
     # Check for n_top_genes beeing greater than the total genes
 
     if 'n_top_genes' in kwargs and kwargs['n_top_genes'] is not None:
         kwargs['n_top_genes'] = min(adata.n_vars, kwargs['n_top_genes'])
-
-    if by_batch and isinstance(by_batch, (list, tuple)) and by_batch[0]:
-        batch_name = by_batch[0]
-        min_n = by_batch[1]
-        k_hvg = np.zeros(adata.shape[1], dtype=int)
-        for batch_value in adata.obs[batch_name].unique():
-            k_v = adata.obs[batch_name] == batch_value
-            ad = adata[k_v, :]
-            k_f, _ = sc.pp.filter_genes(ad, min_cells=3, inplace=False)
-            hvg = sc.pp.highly_variable_genes(
-                ad[:, k_f],
-                min_mean=mean_limits[0],
-                max_mean=mean_limits[1],
-                min_disp=disp_limits[0],
-                max_disp=disp_limits[1],
-                inplace=False,
-                **kwargs,
-            )['highly_variable']
-            k_hvg[k_f] += hvg
-        if subset:
-            adata = adata[:, k_hvg >= min_n]
-        else:
-            adata.var['highly_variable'] = k_hvg >= min_n
-    else:
-        sc.pp.highly_variable_genes(
-            adata,
-            min_mean=mean_limits[0],
-            max_mean=mean_limits[1],
-            min_disp=disp_limits[0],
-            max_disp=disp_limits[1],
-            subset=subset,
-            **kwargs,
-        )
+    
+    sc.pp.highly_variable_genes(
+        adata,
+        min_mean=mean_limits[0],
+        max_mean=mean_limits[1],
+        min_disp=disp_limits[0],
+        max_disp=disp_limits[1],
+        **kwargs,
+    )
+    
     return adata

From 8dde17005afdff5ad393e009ccc26f9dc522ef7c Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Fri, 14 Aug 2020 17:15:31 +0100
Subject: [PATCH 24/29] Add new rank_genes_groups options

---
 scanpy_scripts/cmd_options.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index f0dbf7e6..e111daba 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1180,6 +1180,21 @@
             help='Rank genes by the absolute value of the score, not by the score. '
             'The returned scores are never the absolute values.',
         ),
+        click.option(
+            '--pts',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help='Compute the fraction of cells expressing the genes.'
+        ),
+        click.option(
+            '--tie-correct',
+            is_flag=True,
+            default=False,
+            show_default=True,
+            help="Use tie correction for 'wilcoxon' scores. Used only for "
+            "'wilcoxon'."
+        ),
         click.option(
             '--filter-params',
             type=Dictionary(keys=[

From 494ff24f67975d138e58413ab2522fca74283745 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 18 Aug 2020 11:15:07 +0100
Subject: [PATCH 25/29] Bump Scanpy

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 3760c759..de3a2f82 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@
         'matplotlib',
         'pandas',
         'h5py',
-        'scanpy>=1.5.1',
+        'scanpy>=1.6.0',
         'louvain',
         'leidenalg',
         'loompy>=2.0.0,<3.0.0',

From d50cb627b471d6b1936f09e3ccdb6e71b6cb046d Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 18 Aug 2020 11:41:36 +0100
Subject: [PATCH 26/29] Whoops- forgot to add the combat file

---
 scanpy_scripts/lib/_combat.py | 51 +++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 scanpy_scripts/lib/_combat.py

diff --git a/scanpy_scripts/lib/_combat.py b/scanpy_scripts/lib/_combat.py
new file mode 100644
index 00000000..7ef65bea
--- /dev/null
+++ b/scanpy_scripts/lib/_combat.py
@@ -0,0 +1,51 @@
+"""
+scanpy combat
+"""
+
+import scanpy as sc
+
+# Wrapper for mnn allowing use of non-standard slot
+
+def combat(adata, key=None, key_added=None, layer=None, **kwargs):
+    """
+    Wrapper function for scanpy.pp.combat(), for supporting non-standard slots
+    """
+
+    # If layer is set then we have to move the contents of that layer into
+    # .X for analysis. We back up the original .X, but only if the user hasn't
+    # specified to overwrite it anyway.
+
+    if layer:
+        if key_added and key_added != 'X':
+            adata.layers['X_backup'] = adata.X
+
+        adata.X = adata.layers[layer]
+    
+    # If we're storing results in .X (whether from .X or from a layer), run in
+    # place to save copying objects.
+
+    if key_added and key_added == 'X':
+        sc.pp.combat(adata, key = key, **kwargs)    
+    
+    # If we're storing in 'layers' (key_added is not set, or is not X, then
+    # don't run in place, and put the matrix in the specified layer. 
+
+    else:
+
+        cdata = sc.pp.combat(adata, key=key, inplace = False, **kwargs)    
+            
+        combat_key = 'combat'
+        if layer:
+            combat_key = f"{combat_key}_{layer}"
+        
+            # If we ran from a layer, restore the .X we had to overwrite
+            
+            adata.X = adata.layers['X_backup']    
+            del adata.layers['X_backup'] 
+            
+        if key_added:
+            combat_key = f"{combat_key}_{key_added}"
+
+        adata.layers[combat_key] = cdata
+
+    return adata

From c41469325e14f9c5770417c282b7d7001780c761 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Tue, 18 Aug 2020 13:48:00 +0100
Subject: [PATCH 27/29] Add help text with integration methods

---
 README.md | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/README.md b/README.md
index 8f7fb640..8558a004 100644
--- a/README.md
+++ b/README.md
@@ -37,18 +37,19 @@ Options:
   --help               Show this message and exit.
 
 Commands:
-  read      Read 10x data and save in specified format.
-  filter    Filter data based on specified conditions.
-  norm      Normalise data per cell.
-  hvg       Find highly variable genes.
-  scale     Scale data per gene.
-  regress   Regress-out observation variables.
-  pca       Dimensionality reduction by PCA.
-  neighbor  Compute a neighbourhood graph of observations.
-  embed     Embed cells into two-dimensional space.
-  cluster   Cluster cells into sub-populations.
-  diffexp   Find markers for each clusters.
-  paga      Trajectory inference by abstract graph analysis.
-  dpt       Calculate diffusion pseudotime relative to the root cells.
-  plot      Visualise data.
+  read       Read 10x data and save in specified format.
+  filter     Filter data based on specified conditions.
+  norm       Normalise data per cell.
+  hvg        Find highly variable genes.
+  scale      Scale data per gene.
+  regress    Regress-out observation variables.
+  pca        Dimensionality reduction by PCA.
+  neighbor   Compute a neighbourhood graph of observations.
+  embed      Embed cells into two-dimensional space.
+  cluster    Cluster cells into sub-populations.
+  diffexp    Find markers for each clusters.
+  paga       Trajectory inference by abstract graph analysis.
+  dpt        Calculate diffusion pseudotime relative to the root cells.
+  integrate  Integrate cells from different experimental batches.
+  plot       Visualise data.
   ```

From 987d65505041ca0e8d086294b483b1cb50630808 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Sun, 23 Aug 2020 21:46:26 +0100
Subject: [PATCH 28/29] Simplify commands

---
 scanpy-scripts-tests.bats | 4 ++--
 scanpy_scripts/cmds.py    | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats
index 28ce02aa..4c846ce0 100755
--- a/scanpy-scripts-tests.bats
+++ b/scanpy-scripts-tests.bats
@@ -462,7 +462,7 @@ setup() {
         skip "$harmony_integrate_obj exists and resume is set to 'true'"
     fi
 
-    run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony_integrate $harmony_integrate_opt $louvain_obj $harmony_integrate_obj"
+    run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony $harmony_integrate_opt $louvain_obj $harmony_integrate_obj"
 
     [ "$status" -eq 0 ]
     [ -f  "$plt_rank_genes_groups_matrix_pdf" ]
@@ -515,7 +515,7 @@ setup() {
         skip "$mnn_obj exists and resume is set to 'true'"
     fi
 
-    run rm -f $mnn_obj && eval "$scanpy integrate mnn_correct $mnn_opt $louvain_obj $mnn_obj"
+    run rm -f $mnn_obj && eval "$scanpy integrate mnn $mnn_opt $louvain_obj $mnn_obj"
 
     [ "$status" -eq 0 ]
     [ -f  "$mnn_obj" ]
diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py
index 5b9c5470..b955f68b 100644
--- a/scanpy_scripts/cmds.py
+++ b/scanpy_scripts/cmds.py
@@ -223,7 +223,7 @@
 )
 
 HARMONY_INTEGRATE_CMD = make_subcmd(
-    'harmony_integrate',
+    'harmony',
     sce.pp.harmony_integrate,
     cmd_desc='Use harmonypy [Korunsky19] to integrate different experiments.',    
     arg_desc=_IO_DESC,
@@ -237,7 +237,7 @@
 )
 
 MNN_CORRECT_CMD = make_subcmd(
-    'mnn_correct',
+    'mnn',
     mnn_correct,
     cmd_desc='Correct batch effects by matching mutual nearest neighbors [Haghverdi18] [Kang18].',
     arg_desc=_IO_DESC,

From dc613d9df567e4613a6b1b820e29c85c823fb0d3 Mon Sep 17 00:00:00 2001
From: Jonathan Manning <jmanning@ebi.ac.uk>
Date: Sun, 23 Aug 2020 21:50:15 +0100
Subject: [PATCH 29/29] Finish: simplify commands

---
 scanpy_scripts/cmd_options.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py
index df68b6da..03e16d6b 100644
--- a/scanpy_scripts/cmd_options.py
+++ b/scanpy_scripts/cmd_options.py
@@ -1342,7 +1342,7 @@
         
     ],
 
-    'harmony_integrate': [
+    'harmony': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
         COMMON_OPTIONS['batch_key'],
@@ -1447,7 +1447,7 @@
         COMMON_OPTIONS['random_state'],
     ],
 
-    'mnn_correct': [
+    'mnn': [
         *COMMON_OPTIONS['input'],
         *COMMON_OPTIONS['output'],
         COMMON_OPTIONS['batch_key'],
@@ -1893,4 +1893,4 @@
         COMMON_OPTIONS['swap_axes'],
     ],
 
-}
\ No newline at end of file
+}