From 83adae3d3100995c8d5835a125a8f4d2bd83537d Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 4 Aug 2020 18:15:18 +0100 Subject: [PATCH 01/29] Correct scatter function usage and normalisation step --- scanpy-scripts-tests.bats | 2 +- scanpy_scripts/cmd_options.py | 42 ++++++++++++++++++++++++++++++----- scanpy_scripts/cmd_utils.py | 2 +- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index fe1f78d6..07757cdb 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -14,7 +14,7 @@ setup() { filter_opt="-p n_genes 200 2500 -p c:n_counts 0 50000 -p n_cells 3 inf -p pct_counts_mito 0 0.2 -c mito '!True' --show-obj stdout" filter_obj="${output_dir}/filter.h5ad" norm_mtx="${output_dir}/norm" - norm_opt="-r yes -t 10000 -X ${norm_mtx} --show-obj stdout" + norm_opt="-r yes -t 10000 -l all -n after -X ${norm_mtx} --show-obj stdout" norm_obj="${output_dir}/norm.h5ad" hvg_opt="-m 0.0125 3 -d 0.5 inf -s --show-obj stdout" hvg_obj="${output_dir}/hvg.h5ad" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 39febb2d..d181d67b 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -565,6 +565,7 @@ 'norm': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], + COMMON_OPTIONS['key_added'], click.option( '--save-raw', '-r', type=click.Choice(['yes', 'no', 'counts']), @@ -587,13 +588,44 @@ help='Normalize per cell nUMI to this number.', ), click.option( - '--fraction', + '--exclude-highly-expressed', '-e', 'exclude_highly_expressed', + is_flag=True, + default=False, + show_default=True, + help='Exclude (very) highly expressed genes for the computation of ' + 'the normalization factor (size factor) for each cell. A gene is considered ' + 'highly expressed, if it has more than max_fraction of the total counts in at ' + 'least one cell. The not-excluded genes will sum up to the number ' + 'specified by --normalize-to.' + ), + click.option( + '--max-fraction', '-m', 'max_fraction', type=float, - default=0.9, + default=0.05, + show_default=True, + help='If exclude_highly_expressed=True, consider cells as highly ' + 'expressed that have more counts than max_fraction of the original total counts ' + 'in at least one cell.' + ), + click.option( + '--layers', '-l', + type=CommaSeparatedText(simplify=True), + default=None, + show_default=True, + help="List of layers to normalize. Set to 'all' to normalize all layers." + ), + click.option( + '--layer-norm', '-n', 'layer_norm', + type=click.Choice(['after', 'X']), + default=None, show_default=True, - help='Only use genes that make up less than this fraction of the total ' - 'count in every cell. So only these genes will sum up to the number ' - 'specified by --normalize-to.', + help="Specifies how to normalize layers: 1) If None, after " + "normalization, for each layer in layers each cell has a total count equal to " + "the median of the counts_per_cell before normalization of the layer. 2) If " + "'after', for each layer in layers each cell has a total count equal to " + "target_sum. 3) If 'X', for each layer in layers each cell has a total count " + "equal to the median of total counts for observations (cells) of adata.X before " + "normalization.'" ), ], diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py index a1e88f2b..df2b7ca1 100644 --- a/scanpy_scripts/cmd_utils.py +++ b/scanpy_scripts/cmd_utils.py @@ -159,7 +159,7 @@ def make_plot_function(func_name, kind=None): # Provide a function translation plot_funcs = { - 'scatter': sc.plotting._tools.scatterplots.plot_scatter, + 'scatter': sc.pl.scatter, 'sviol': sc.pl.stacked_violin, 'rgg_sviol': sc.pl.rank_genes_groups_stacked_violin, 'dot': sc.pl.dotplot, From c3f4d644d23bb725d0ce92a4c1756db83fa99128 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 4 Aug 2020 18:20:08 +0100 Subject: [PATCH 02/29] Relax most pins --- setup.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 336fa217..536cc5e4 100644 --- a/setup.py +++ b/setup.py @@ -36,12 +36,12 @@ ), install_requires=[ 'packaging', - 'anndata<0.6.20', - 'scipy>=1.2.0,<1.3.0', + 'anndata', + 'scipy', 'matplotlib', 'pandas', - 'h5py<2.10', - 'scanpy>=1.4.2,<1.4.4', + 'h5py', + 'scanpy>=1.5.1', 'louvain', 'leidenalg', 'loompy>=2.0.0,<3.0.0', From 38701f1799dfcb03fa7a38050faaa29fa2627e57 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 5 Aug 2020 10:55:13 +0100 Subject: [PATCH 03/29] Add some missing options --- scanpy-scripts-tests.bats | 2 +- scanpy_scripts/cmd_options.py | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 07757cdb..ab2ecca8 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -25,7 +25,7 @@ setup() { pca_embed="${output_dir}/pca.tsv" pca_opt="--n-comps 50 -V auto --show-obj stdout -E ${pca_embed}" pca_obj="${output_dir}/pca.h5ad" - neighbor_opt="-k 5,10,20 -n 25 -m umap --show-obj stdout" + neighbor_opt="-k 5,10,20 -n 25 -m umap -t euclidean --show-obj stdout" neighbor_obj="${output_dir}/neighbor.h5ad" tsne_embed="${output_dir}/tsne.tsv" tsne_opt="-n 25 --use-rep X_pca --learning-rate 200 -E ${tsne_embed}" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index d181d67b..0078e850 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -700,6 +700,12 @@ help='When specified, clip to this value after scaling, otherwise do ' 'not clip', ), + click.option( + '--layer', '-l', + type=CommaSeparatedText(simplify=True), + default=None, + help="If provided, which element of layers to scale." + ), ], 'regress': [ @@ -788,11 +794,19 @@ ), click.option( '--method', '-m', - type=click.Choice(['umap', 'gauss']), + type=click.Choice(['umap', 'gauss', 'rapids']), default='umap', show_default=True, help='Use umap or gauss with adaptive width for computing ' - 'connectivities.' + 'connectivities. Use rapids for the RAPIDS implementation of UMAP ' + '(experimental, GPU only).' + ), + click.option( + '--metric', '-t', + type=click.Choice(['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']), + default='euclidean', + show_default=True, + help='A known metric’s name.' ), ], From 10171a5d8b56bf4f506f8d4d26fd07a6fada037f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 5 Aug 2020 11:00:42 +0100 Subject: [PATCH 04/29] Add method argument for umap --- scanpy_scripts/cmd_options.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 0078e850..208b6518 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -879,6 +879,14 @@ help='The number of negative edge samples to use per positive edge ' 'sample in optimizing the low dimensional embedding.', ), + click.option( + '--method', + type=click.Choice(['umap', 'rapids']), + default='umap', + show_default=True, + help='Use the original ‘umap’ implementation, or ‘rapids’ ' + '(experimental, GPU only).' + ), ], 'tsne': [ From f4a0b6b359bef81608657cb12c0e157da0e6b356 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Wed, 5 Aug 2020 18:19:02 +0100 Subject: [PATCH 05/29] Fix fdg call to use scanpy-native functionality for slot specification --- scanpy-scripts-tests.bats | 2 +- scanpy_scripts/cmd_options.py | 41 +++++++++++++++++++++++++++++++---- scanpy_scripts/lib/_fdg.py | 21 ++++-------------- 3 files changed, 42 insertions(+), 22 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index ab2ecca8..43abf11b 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -34,7 +34,7 @@ setup() { umap_opt="--use-graph neighbors_k10 --min-dist 0.75 --alpha 1 --gamma 1 -E ${umap_embed}" umap_obj="${output_dir}/umap.h5ad" fdg_embed="${output_dir}/fdg.tsv" - fdg_opt="--use-graph neighbors_k10 --layout fr -E ${fdg_embed}" + fdg_opt="--neighbors-key neighbors_k10 --layout fr -E ${fdg_embed}" fdg_obj="${output_dir}/fdg.h5ad" louvain_opt="-r 0.5,1 --use-graph neighbors_k10 --key-added k10" louvain_obj="${output_dir}/louvain.h5ad" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 208b6518..de3149a9 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -456,7 +456,15 @@ show_default=True, help='Number of genes to show.' ), - ] + ], + + 'root': click.option( + '--root', + type=click.INT, + default=0, + show_default=True, + help='If choosing a tree layout, this is the index of the root node.', + ), } CMD_OPTIONS = { @@ -946,10 +954,9 @@ 'fdg': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], - COMMON_OPTIONS['knn_graph'][0], # --use-graph COMMON_OPTIONS['random_state'], - COMMON_OPTIONS['key_added'], COMMON_OPTIONS['export_embedding'], + COMMON_OPTIONS['root'], click.option( '--init-pos', type=click.STRING, @@ -959,7 +966,7 @@ ), click.option( '--layout', - type=click.Choice(['fa', 'fr', 'grid_fr', 'kk', 'lgl', 'drl', 'rt']), + type=click.Choice(['fa', 'fr', 'grid_fr', 'kk', 'lgl', 'drl', 'rt', 'rt_circular']), default='fa', show_default=True, help='Name of any valid igraph layout, including "fa" (ForceAtlas2), ' @@ -968,6 +975,13 @@ '(Large Graph Layout, very fast), "drl" (Distributed Recursive Layout, ' 'pretty fast) and "rt" (Reingold Tilford tree layout).', ), + click.option( + '--key-added-ext', + type=click.STRING, + default=None, + show_default=True, + help="By default, append 'layout'" + ), click.option( '--init-pos', type=click.STRING, @@ -976,6 +990,24 @@ help='How to initialize the low dimensional embedding. Can be "paga", ' 'or any valid key of `.obsm`.', ), + click.option( + '--neighbors-key', + type=click.STRING, + default=None, + show_default=True, + help='If not specified, draw_graph looks .obsp[‘connectivities’] ' + 'for connectivities (default storage place for pp.neighbors). If specified, ' + 'draw_graph looks .obsp[.uns[neighbors_key][‘connectivities_key’]] for ' + 'connectivities.' + ), + click.option( + '--obsp', + type=click.STRING, + default=None, + show_default=True, + help='Use .obsp[obsp] as adjacency. You can’t specify both obsp and ' + 'neighbors_key at the same time.' + ), ], 'louvain': [ @@ -1272,6 +1304,7 @@ help='Do not draw edges for weights below this threshold. Set to 0 to ' 'include all edges.', ), + COMMON_OPTIONS['root'], click.option( '--root', type=click.INT, diff --git a/scanpy_scripts/lib/_fdg.py b/scanpy_scripts/lib/_fdg.py index 91270f76..68bf95d9 100644 --- a/scanpy_scripts/lib/_fdg.py +++ b/scanpy_scripts/lib/_fdg.py @@ -15,7 +15,7 @@ def fdg( adata, use_graph='neighbors', layout='fa', - key_added=None, + key_added_ext=None, random_state=0, export_embedding=None, **kwargs @@ -24,29 +24,16 @@ def fdg( Wrapper function for sc.tl.draw_graph, for supporting named slot of fdg embeddings. """ - adj_mat = None - if use_graph != 'neighbors': - if use_graph not in adata.uns.keys(): - raise KeyError(f'"{use_graph}" is not a valid key of `.uns`.') - adj_mat = adata.uns[use_graph]['connectivities'] - - _backup_obsm_key(adata, f'X_draw_graph_{layout}') - sc.tl.draw_graph( adata, layout=layout, + key_added_ext=key_added_ext, random_state=random_state, - adjacency=adj_mat, **kwargs, ) - fdg_key = f'X_draw_graph_{layout}' - if key_added: - fdg_key = f'X_draw_graph_{layout}_{key_added}' - _rename_obsm_key(adata, f'X_draw_graph_{layout}', fdg_key) - else: - _delete_obsm_backup_key(adata, f'X_draw_graph_{layout}') + fdg_key = f'X_draw_graph_{key_added_ext or layout}' if export_embedding is not None: - write_embedding(adata, fdg_key, export_embedding, key_added=key_added) + write_embedding(adata, fdg_key, export_embedding, key_added=key_added_ext) return adata From 155e35685b77de7811fc769127a06f6d0be8551f Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 6 Aug 2020 14:26:22 +0100 Subject: [PATCH 06/29] Use Scanpy-native option for graph specification in umap, move associated fields to common --- scanpy-scripts-tests.bats | 2 +- scanpy_scripts/cmd_options.py | 43 ++++++++++++++++------------------- scanpy_scripts/lib/_umap.py | 4 ---- 3 files changed, 21 insertions(+), 28 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 43abf11b..966553a5 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -31,7 +31,7 @@ setup() { tsne_opt="-n 25 --use-rep X_pca --learning-rate 200 -E ${tsne_embed}" tsne_obj="${output_dir}/tsne.h5ad" umap_embed="${output_dir}/umap.tsv" - umap_opt="--use-graph neighbors_k10 --min-dist 0.75 --alpha 1 --gamma 1 -E ${umap_embed}" + umap_opt="--neighbors-key neighbors_k10 --min-dist 0.75 --alpha 1 --gamma 1 -E ${umap_embed}" umap_obj="${output_dir}/umap.h5ad" fdg_embed="${output_dir}/fdg.tsv" fdg_opt="--neighbors-key neighbors_k10 --layout fr -E ${fdg_embed}" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index de3149a9..feab4e09 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -135,12 +135,25 @@ 'knn_graph': [ click.option( - '--use-graph', + '--neighbors-key', + type=click.STRING, + default=None, + show_default=False, + help='If not specified, look in .uns[‘neighbors’] for neighbors ' + 'settings and .obsp[‘connectivities’], .obsp[‘distances’] for connectivities and ' + 'distances respectively (default storage places for pp.neighbors). If specified, ' + 'look in .uns[neighbors_key] for neighbors settings and ' + '.obsp[.uns[neighbors_key][‘connectivities_key’]], ' + '.obsp[.uns[neighbors_key][‘distances_key’]] for connectivities and distances ' + 'respectively.' + ), + click.option( + '--obsp', type=click.STRING, - default='neighbors', + default=None, show_default=True, - help='Slot name under `.uns` that contains the KNN graph of which ' - 'sparse adjacency matrix is used for clustering.', + help='Use .obsp[obsp] as adjacency. You can’t specify both obsp and ' + 'neighbors_key at the same time.' ), click.option( '--directed/--undirected', 'directed', @@ -821,7 +834,7 @@ 'umap': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], - COMMON_OPTIONS['knn_graph'][0], # --use-graph + COMMON_OPTIONS['knn_graph'][0], # --neighbors-key COMMON_OPTIONS['random_state'], COMMON_OPTIONS['key_added'], COMMON_OPTIONS['export_embedding'], @@ -990,24 +1003,8 @@ help='How to initialize the low dimensional embedding. Can be "paga", ' 'or any valid key of `.obsm`.', ), - click.option( - '--neighbors-key', - type=click.STRING, - default=None, - show_default=True, - help='If not specified, draw_graph looks .obsp[‘connectivities’] ' - 'for connectivities (default storage place for pp.neighbors). If specified, ' - 'draw_graph looks .obsp[.uns[neighbors_key][‘connectivities_key’]] for ' - 'connectivities.' - ), - click.option( - '--obsp', - type=click.STRING, - default=None, - show_default=True, - help='Use .obsp[obsp] as adjacency. You can’t specify both obsp and ' - 'neighbors_key at the same time.' - ), + COMMON_OPTIONS['knn_graph'][0], # --neighbors-key + COMMON_OPTIONS['knn_graph'][1], # --obsp ], 'louvain': [ diff --git a/scanpy_scripts/lib/_umap.py b/scanpy_scripts/lib/_umap.py index 0131c0f5..197edf83 100644 --- a/scanpy_scripts/lib/_umap.py +++ b/scanpy_scripts/lib/_umap.py @@ -14,7 +14,6 @@ def umap( adata, - use_graph='neighbors', key_added=None, random_state=0, export_embedding=None, @@ -23,7 +22,6 @@ def umap( """ Wrapper function for sc.tl.umap, for supporting named slot of umap embeddings """ - _set_default_key(adata.uns, 'neighbors', use_graph) if not isinstance(random_state, (list, tuple)): _backup_obsm_key(adata, 'X_umap') @@ -51,10 +49,8 @@ def umap( 'iterable of the same length as `random_state`.') umap( adata, - use_graph='neighbors', key_added=umap_key, random_state=rseed, **kwargs, ) - _restore_default_key(adata.uns, 'neighbors', use_graph) return adata From a36a7e5ac47e400750ed1984543c2988514a862d Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 6 Aug 2020 15:24:21 +0100 Subject: [PATCH 07/29] Remove legacy snippet --- scanpy_scripts/lib/_fdg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scanpy_scripts/lib/_fdg.py b/scanpy_scripts/lib/_fdg.py index 68bf95d9..6fbdb213 100644 --- a/scanpy_scripts/lib/_fdg.py +++ b/scanpy_scripts/lib/_fdg.py @@ -13,7 +13,6 @@ def fdg( adata, - use_graph='neighbors', layout='fa', key_added_ext=None, random_state=0, From 45f75c06d303c1cabbbdbd634d2909c337ac38af Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 6 Aug 2020 15:25:08 +0100 Subject: [PATCH 08/29] Switch louvain to native neighors_key/ obsp for slot selection --- scanpy-scripts-tests.bats | 3 ++- scanpy_scripts/lib/_louvain.py | 23 ++++++++++++----------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 966553a5..88c687ea 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -36,7 +36,8 @@ setup() { fdg_embed="${output_dir}/fdg.tsv" fdg_opt="--neighbors-key neighbors_k10 --layout fr -E ${fdg_embed}" fdg_obj="${output_dir}/fdg.h5ad" - louvain_opt="-r 0.5,1 --use-graph neighbors_k10 --key-added k10" + louvain_tsv="${output_dir}/louvain.tsv" + louvain_opt="-r 0.5,1 --neighbors-key neighbors_k10 --key-added k10 --export-cluster ${louvain_tsv}" louvain_obj="${output_dir}/louvain.h5ad" leiden_tsv="${output_dir}/leiden.tsv" leiden_opt="-r 0.3,0.7 --use-graph neighbors_k10 --key-added k10 -F loom --export-cluster ${leiden_tsv}" diff --git a/scanpy_scripts/lib/_louvain.py b/scanpy_scripts/lib/_louvain.py index 9d044355..3130da8c 100644 --- a/scanpy_scripts/lib/_louvain.py +++ b/scanpy_scripts/lib/_louvain.py @@ -9,7 +9,8 @@ def louvain( adata, resolution, - use_graph=None, + neighbors_key=None, + obsp=None, key_added=None, export_cluster=None, **kwargs @@ -20,11 +21,7 @@ def louvain( keys = [] if kwargs['restrict_to'] and not kwargs['restrict_to'][0]: kwargs['restrict_to'] = None - adj_mat = None - if use_graph: - if use_graph not in adata.uns: - raise KeyError(f'"{use_graph}" is not a valid key of `.uns`.') - adj_mat = adata.uns[use_graph]['connectivities'] + if not isinstance(resolution, (list, tuple)): if key_added is not None and not key_added.startswith('louvain_'): key_added = f'louvain_{key_added}' @@ -33,19 +30,22 @@ def louvain( sc.tl.louvain( adata, resolution=resolution, - adjacency=adj_mat, key_added=key_added, + neighbors_key=neighbors_key, + obsp=obsp, **kwargs ) keys.append(key_added) else: for i, res in enumerate(resolution): + res_key = str(res).replace('.', '_') + if key_added is None: - graph_key = ('_' + use_graph) if use_graph else '' - key = f'louvain{graph_key}_r{res_key}' + graph_key = ('_' + f'{neighbors_key or obsp}') if neighbors or obsp else '' + key = f'louvain{graph_key}_r{res_key}' elif not isinstance(key_added, (list, tuple)): - key = 'louvain_{key_added}_r{res_key}' + key = f'louvain_{key_added}_r{res_key}' elif len(key_added) == len(resolution): key = key_added[i] else: @@ -54,7 +54,8 @@ def louvain( keys.extend(louvain( adata, resolution=res, - use_graph=use_graph, + neighbors_key=neighbors_key, + obsp=obsp, key_added=key, **kwargs, )) From 5f051158f1a1e5008642dbf08a2727702956e2d1 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 6 Aug 2020 16:11:00 +0100 Subject: [PATCH 09/29] Switch leiden to native neighors_key/ obsp for slot selection, remove exhangeable Loom (incompatible with latest annData) and use native non-exchangeable Scanpy Loom export instead --- scanpy-scripts-tests.bats | 2 +- scanpy_scripts/cmd_options.py | 7 + scanpy_scripts/cmd_utils.py | 8 +- scanpy_scripts/exchangeable_loom.py | 368 ---------------------------- scanpy_scripts/lib/_leiden.py | 17 +- 5 files changed, 21 insertions(+), 381 deletions(-) delete mode 100644 scanpy_scripts/exchangeable_loom.py diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 88c687ea..1960e887 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -40,7 +40,7 @@ setup() { louvain_opt="-r 0.5,1 --neighbors-key neighbors_k10 --key-added k10 --export-cluster ${louvain_tsv}" louvain_obj="${output_dir}/louvain.h5ad" leiden_tsv="${output_dir}/leiden.tsv" - leiden_opt="-r 0.3,0.7 --use-graph neighbors_k10 --key-added k10 -F loom --export-cluster ${leiden_tsv}" + leiden_opt="-r 0.3,0.7 --neighbors-key neighbors_k10 --key-added k10 -F loom --loom-write-obsm-varm --export-cluster ${leiden_tsv}" leiden_obj="${output_dir}/leiden.loom" diffexp_tsv="${output_dir}/diffexp.tsv" diffexp_opt="-g leiden_k10_r0_7 --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv} -f loom" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index feab4e09..d2daa9ff 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -48,6 +48,13 @@ show_default=True, help='Chunk size for writing output in zarr format.', ), + click.option( + '--loom-write-obsm-varm', '-b', + is_flag=True, + default=False, + show_default=True, + help='Write obsm and varm to the Loom file?', + ), click.option( '--export-mtx', '-X', type=click.Path(dir_okay=True, writable=True), diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py index df2b7ca1..81e691b8 100644 --- a/scanpy_scripts/cmd_utils.py +++ b/scanpy_scripts/cmd_utils.py @@ -5,7 +5,6 @@ import click import pandas as pd import scanpy as sc -from .exchangeable_loom import read_exchangeable_loom, write_exchangeable_loom from .cmd_options import CMD_OPTIONS from .lib._paga import plot_paga @@ -33,6 +32,7 @@ def cmd( input_format=None, output_format=None, zarr_chunk_size=None, + loom_write_obsm_varm=False, export_mtx=None, show_obj=None, **kwargs @@ -50,6 +50,7 @@ def cmd( output_obj, output_format=output_format, chunk_size=zarr_chunk_size, + write_obsm_varm=loom_write_obsm_varm, export_mtx=export_mtx, show_obj=show_obj, ) @@ -73,7 +74,7 @@ def _read_obj(input_obj, input_format='anndata', **kwargs): if input_format == 'anndata': adata = sc.read(input_obj, **kwargs) elif input_format == 'loom': - adata = read_exchangeable_loom(input_obj, **kwargs) + adata = sc.read_loom(input_obj, **kwargs) else: raise NotImplementedError( 'Unsupported input format: {}'.format(input_format)) @@ -87,12 +88,13 @@ def _write_obj( chunk_size=None, export_mtx=None, show_obj=None, + write_obsm_varm=False, **kwargs ): if output_format == 'anndata': adata.write(output_obj, compression='gzip') elif output_format == 'loom': - write_exchangeable_loom(adata, output_obj, **kwargs) + adata.write_loom(output_obj, write_obsm_varm=write_obsm_varm ) elif output_format == 'zarr': adata.write_zarr(output_obj, chunk_size=chunk_size, **kwargs) else: diff --git a/scanpy_scripts/exchangeable_loom.py b/scanpy_scripts/exchangeable_loom.py deleted file mode 100644 index 06d7af78..00000000 --- a/scanpy_scripts/exchangeable_loom.py +++ /dev/null @@ -1,368 +0,0 @@ -#!/usr/bin/env python -"""exchangeable_loom - -The exchangeable Loom format is based on Loom spec 3.0.0 and is defined -by the following minimum structure: - - /.attrs['LOOM_SPEC_VERSION'] = '3.0.0' - /attr - /attr/manifest = table(columns=['loom_path', 'dtype', ...]) - /matrix - /layers - /col_attrs - /col_graphs - /row_attrs - /row_graphs - -For conversion between AnnData and exchangeable Loom, anndata.obsm, anndata.varm -and non-scalar value of anndata.uns are stored as datasets under /attr, while -scalar value of anndata.uns are store at global attributes at /.attrs to make it -backward-compatible with Loom 2.0.1 and AnnData. - -* Provides - - Read and write functions for conversion between AnnData and exchangeable - Loom. -""" - -import logging -import anndata -import h5py -import numpy as np -import pandas as pd -import scipy.sparse as sp -from packaging import version - - -EXCHANGEABLE_LOOM_VERSION = '3.0.0' - - -def _h5_read_attrs(node, name): - data = node.attrs[name] - if isinstance(data, np.ndarray) and len(data) == 1: - data = data[0] - return data - - -def _h5_read_coo_matrix(node): - shape = tuple(map(int, node.attrs['shape'].decode().split(','))) - return sp.coo_matrix((node['w'], (node['a'], node['b'])), shape=shape) - - -def _h5_write_coo_matrix(root, path, graph): - if path not in root: - root.create_group(path) - graph_node = root[path] - if not isinstance(graph_node, h5py.Group): - raise ValueError(path) - row_path = '{}/{}'.format(path, 'a') - col_path = '{}/{}'.format(path, 'b') - data_path = '{}/{}'.format(path, 'w') - if row_path in root: - del root[row_path] - if col_path in root: - del root[col_path] - if data_path in root: - del root[data_path] - root.create_dataset(row_path, data=graph.row) - root.create_dataset(col_path, data=graph.col) - root.create_dataset(data_path, data=graph.data) - graph_node.attrs['shape'] = (','.join(map(str, graph.shape))).encode() - return 0 - - -def _h5_read_csr_matrix(node): - shape = tuple(map(int, node.attrs['shape'].decode().split(','))) - return sp.csr_matrix((node['data'], node['indices'], node['indptr']), shape=shape) - - -def _h5_write_csr_matrix(root, path, matrix): - if path not in root: - root.create_group(path) - graph_node = root[path] - if not isinstance(graph_node, h5py.Group): - raise ValueError(path) - data_path = '{}/{}'.format(path, 'data') - indices_path = '{}/{}'.format(path, 'indices') - indptr_path = '{}/{}'.format(path, 'indptr') - if data_path in root: - del root[data_path] - if indices_path in root: - del root[indices_path] - if indptr_path in root: - del root[indptr_path] - root.create_dataset(data_path, data=matrix.data) - root.create_dataset(indices_path, data=matrix.indices) - root.create_dataset(indptr_path, data=matrix.indptr) - graph_node.attrs['shape'] = (','.join(map(str, matrix.shape))).encode() - return 0 - - -def _h5_write_recursive_dictionary( - data, - path, - record, - attr_root=None, - dataset_root=None, - graph_root=None, -): - if isinstance(data, dict): - for child in list(data.keys()): - _h5_write_recursive_dictionary( - data[child], - '{}__{}'.format(path, child), - record, - attr_root=attr_root, - dataset_root=dataset_root, - graph_root=graph_root, - ) - elif isinstance(data, (bytes, str, int, float, np.number, np.character)): - dtype = type(data) - if attr_root: - if isinstance(data, str): - data = data.encode() - attr_root.attrs[path] = data - record.append('{}.attrs[{}]::scalar'.format(attr_root.name, path)) - else: - logging.warning('Ignoring attr {} ({})'.format(path, dtype)) - elif isinstance(data, (tuple, list, np.ndarray)): - dtype = type(data) - if dataset_root: - if not isinstance(data, np.ndarray): - data = np.array(data) - # Convert to bytes if unicode string - if data.dtype.kind == 'U': - data = data.astype(np.character) - if path in dataset_root: - del dataset_root[path] - dataset_root.create_dataset(path, data=data) - record.append('{}/{}::array'.format(dataset_root.name, path)) - else: - logging.warning('Ignoring dataset {} ({})'.format(path, dtype)) - elif isinstance(data, sp.csr_matrix): - dtype = type(data) - if graph_root: - _h5_write_coo_matrix(graph_root, path, sp.coo_matrix(data)) - record.append('{}/{}::graph'.format(graph_root.name, path)) - else: - logging.warning('Ignoring graph {} ({})'.format(path, dtype)) - else: - logging.warning('Ignoring {} ({})'.format(path, dtype)) - - -def _is_exchangeable_loom(filename): - with h5py.File(filename, mode='r') as lm: - try: - loom_version = _h5_read_attrs(lm, 'LOOM_SPEC_VERSION').decode() - except Exception: - loom_version = '2.0.0' - return version.parse(loom_version) >= version.parse(EXCHANGEABLE_LOOM_VERSION) - - -def _read_manifest(h5file): - return pd.DataFrame( - np.array(h5file['attr']['manifest']).astype(str), - columns=['loom_path', 'dtype', 'anndata_path', 'sce_path'], - ) - - -def read_exchangeable_loom(filename, sparse=True): - """Read exchangeable Loom - - * Parameters - + filename : str - Path of the input exchangeable Loom file - - * Returns - + adata : AnnData - An AnnData object - """ - # Use anndata to read matrix, obs and var - adata = anndata.read_loom(filename, sparse=sparse, validate=False) - - if not _is_exchangeable_loom(filename): - return adata - - # Use h5py to read the rest - with h5py.File(filename, mode='r') as lm: - manifest = _read_manifest(lm) - for i, row in manifest.iterrows(): - anndata_path = row['anndata_path'] - loom_path = row['loom_path'] - dtype = row['dtype'] - if anndata_path and loom_path: - # Get data from loom_path - if loom_path.startswith('/.attrs['): - lm_path = loom_path[8:-1] # remove '/.attrs[' - data = _h5_read_attrs(lm, lm_path) - else: - data = lm[loom_path] - # Type conversion according to dtype - if dtype == 'array': - data = np.array(data) - # Convert to unicode string if bytes - if data.dtype.kind == 'S': - data = data.astype(str) - elif dtype == 'graph': - data = sp.csr_matrix(_h5_read_coo_matrix(data)) - elif dtype == 'scalar': - if isinstance(data, (h5py.Dataset, np.ndarray)): - data = data[0] - if isinstance(data, bytes): - data = data.decode() - elif dtype == 'csr_matrix': - data = _h5_read_csr_matrix(data) - elif dtype == 'df': - data = pd.DataFrame(data) - - # Put data to the right location according to anndata_path - if anndata_path.startswith('/uns'): - # For .uns, write recursive dictionary as necessary - attr = adata.uns - paths = anndata_path[5:].split('/') - n_path = len(paths) - for i, path in enumerate(paths): - if i == n_path - 1: - attr[path] = data - else: - if path not in attr: - attr[path] = {} - attr = attr[path] - elif anndata_path.startswith('/obsm'): - attr = anndata_path[6:] - adata.obsm[attr] = data - elif anndata_path.startswith('/varm'): - attr = anndata_path[6:] - adata.varm[attr] = data - elif anndata_path == '/raw.X': - if adata.raw is None: - adata.raw = anndata.AnnData(X=data) - else: - adata.raw.X = data - elif anndata_path == '/raw.var': - adata.raw.var.index = data - else: - logging.warning('Unexpected anndata path: {}'.format(anndata_path)) - return adata - - -def write_exchangeable_loom(adata, filename, col_graphs=['neighbors']): - """Write an AnnData object to an exchangeable Loom - - * Parameters - + adata : AnnData - An AnnData object - + filename : str - Path of the output exchangeable Loom file - """ - adata.write_loom(filename) - manifest = {'loom': [], 'dtype': [], 'anndata': [], 'sce': []} - with h5py.File(filename, mode='r+') as lm: - # Write modified LOOM_SPEC_VERSION - lm.attrs['LOOM_SPEC_VERSION'] = EXCHANGEABLE_LOOM_VERSION.encode() - - # Write creation/modification info - if 'created_from' not in lm.attrs: - lm.attrs['created_from'] = 'anndata' - lm.attrs['last_modified_by'] = 'scanpy' - - # Record which columns are used as colnames/rownames - lm['col_attrs'].attrs['CellID'] = 'obs_names' - lm['row_attrs'].attrs['Gene'] = 'var_names' - - # Create necessary groups - lm.create_group('/attr') - - # Write /obsm - for k in adata.obsm.keys(): - arr = adata.obsm[k] - arr_name = k.replace('X_', '').upper() - # Derive paths - anndata_path = '/obsm/{}'.format(k) - loom_path = '/attr/reducedDims__{}'.format(arr_name) - sce_path = '@reducedDims@listData${}'.format(arr_name) - # Record mapping - manifest['loom'].append(loom_path) - manifest['dtype'].append('array') - manifest['anndata'].append(anndata_path) - manifest['sce'].append(sce_path) - # Write content to loom - lm.create_dataset(loom_path, data=arr) - - # Write /varm - for k in adata.varm.keys(): - arr = adata.varm[k] - # Derive paths - anndata_path = '/varm/{}'.format(k) - loom_path = '/attr/varm__{}'.format(k) - # Record mapping - manifest['loom'].append(loom_path) - manifest['dtype'].append('array') - manifest['anndata'].append(anndata_path) - manifest['sce'].append('') - # Write content to loom - lm.create_dataset(loom_path, data=arr) - - # Write /uns - uns_entries = [] - for k in adata.uns.keys(): - item = adata.uns[k] - # Special handling of 'neighbors' as it contains graphs - if k in col_graphs: - # Write content to loom while recording loom path and dtype - _h5_write_recursive_dictionary( - item, - k, - uns_entries, - attr_root=lm, - dataset_root=lm['/attr'], - graph_root=lm['/col_graphs'], - ) - else: - # Write content to loom while recording loom path and dtype - _h5_write_recursive_dictionary( - item, - k, - uns_entries, - attr_root=lm, - dataset_root=lm['/attr'], - graph_root=lm['/attr'], - ) - # Derive paths - for entry in uns_entries: - loom_path, _, dtype = entry.partition('::') - if loom_path.startswith('/col_graphs/'): - path = loom_path[12:] - sce_path = '@colGraphs${}'.format(path) - else: - if loom_path.startswith('/.attrs['): - path = loom_path[8:-1] # remove '/.attrs[' - elif loom_path.startswith('/attr/'): - path = loom_path[6:] # remove '/attr/' - else: - logging.warning('unexpected path: {}'.format(loom_path)) - sce_path = '@metadata${}'.format(path.replace('__', '$')) - anndata_path = '/uns/{}'.format(path.replace('__', '/')) - # Record mapping - manifest['loom'].append(loom_path) - manifest['dtype'].append(dtype) - manifest['anndata'].append(anndata_path) - manifest['sce'].append(sce_path) - - # Write /raw - if adata.raw is not None: - _h5_write_csr_matrix(lm['/attr'], 'raw.X', adata.raw.X) - manifest['loom'].append('/attr/raw.X') - manifest['dtype'].append('csr_matrix') - manifest['anndata'].append('/raw.X') - manifest['sce'].append('@metadata$raw.X') - lm['/attr'].create_dataset( - 'raw.var', data=adata.raw.var.index.values.astype(bytes)) - manifest['loom'].append('/attr/raw.var') - manifest['dtype'].append('array') - manifest['anndata'].append('/raw.var') - manifest['sce'].append('@metadata$raw.var') - - # Write mapping - lm['/attr'].create_dataset( - 'manifest', data=pd.DataFrame(manifest).values.astype(np.character)) diff --git a/scanpy_scripts/lib/_leiden.py b/scanpy_scripts/lib/_leiden.py index 2d6844c1..8ca18eb4 100644 --- a/scanpy_scripts/lib/_leiden.py +++ b/scanpy_scripts/lib/_leiden.py @@ -9,7 +9,8 @@ def leiden( adata, resolution, - use_graph=None, + neighbors_key=None, + obsp=None, key_added=None, export_cluster=None, **kwargs @@ -20,11 +21,7 @@ def leiden( keys = [] if kwargs.get('restrict_to', None) and not kwargs['restrict_to'][0]: kwargs['restrict_to'] = None - adj_mat = None - if use_graph: - if use_graph not in adata.uns: - raise KeyError(f'"{use_graph}" is not a valid key of `.uns`.') - adj_mat = adata.uns[use_graph]['connectivities'] + if not isinstance(resolution, (list, tuple)): if key_added is not None and not key_added.startswith('leiden_'): key_added = f'leiden_{key_added}' @@ -33,7 +30,8 @@ def leiden( sc.tl.leiden( adata, resolution=resolution, - adjacency=adj_mat, + neighbors_key=neighbors_key, + obsp=obsp, key_added=key_added, **kwargs ) @@ -42,7 +40,7 @@ def leiden( for i, res in enumerate(resolution): res_key = str(res).replace('.', '_') if key_added is None: - graph_key = ('_' + use_graph) if use_graph else '' + graph_key = ('_' + f'{neighbors_key or obsp}') if neighbors or obsp else '' key = f'leiden{graph_key}_r{res_key}' elif not isinstance(key_added, (list, tuple)): key = f'leiden_{key_added}_r{res_key}' @@ -54,7 +52,8 @@ def leiden( keys.extend(leiden( adata, resolution=res, - use_graph=use_graph, + neighbors_key=neighbors_key, + obsp=obsp, key_added=key, **kwargs, )) From feabc73344001fdec2d2f54af9f60d15384ee894 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 6 Aug 2020 16:55:44 +0100 Subject: [PATCH 10/29] Add layer specification to rank_genes_groups call --- scanpy_scripts/cmd_options.py | 7 +++++++ scanpy_scripts/lib/_diffexp.py | 18 ++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index d2daa9ff..91d2e4b9 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1072,6 +1072,13 @@ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], COMMON_OPTIONS['use_raw'], + COMMON_OPTIONS['key_added'], + click.option( + '--layer', '-l', + type=click.STRING, + default=None, + help="Key from adata.layers whose value will be used to perform tests on." + ), click.option( '--groupby', '-g', type=click.STRING, diff --git a/scanpy_scripts/lib/_diffexp.py b/scanpy_scripts/lib/_diffexp.py index f5f88d2c..5f2518bd 100644 --- a/scanpy_scripts/lib/_diffexp.py +++ b/scanpy_scripts/lib/_diffexp.py @@ -10,7 +10,8 @@ def diffexp( adata, use_raw=True, n_genes=None, - key_added=None, + key_added='rank_genes_groups', + layer=None, logreg_param=None, filter_params=None, save=None, @@ -29,23 +30,24 @@ def diffexp( for key, val in logreg_param: kwargs[key] = val - sc.tl.rank_genes_groups( - adata, use_raw=use_raw, n_genes=n_genes, key_added=key_added, **kwargs) - key_added = key_added if key_added else 'rank_genes_groups' + diff_key = (key_added + f'_{layer}') if layer else key_added + + sc.tl.rank_genes_groups( + adata, use_raw=use_raw, n_genes=n_genes, key_added=diff_key, **kwargs) - de_tbl = extract_de_table(adata.uns[key_added]) + de_tbl = extract_de_table(adata.uns[diff_key]) if isinstance(filter_params, dict): sc.tl.filter_rank_genes_groups( adata, - key=key_added, - key_added=key_added + '_filtered', + key=diff_key, + key_added=diff_key + '_filtered', use_raw=use_raw, **filter_params, ) - de_tbl = extract_de_table(adata.uns[key_added + '_filtered']) + de_tbl = extract_de_table(adata.uns[diff_key + '_filtered']) de_tbl = de_tbl.loc[de_tbl.genes.astype(str) != 'nan', :] if save: From 8a34634fcc919441d917797dff93d3ad8ff0029a Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 6 Aug 2020 17:15:13 +0100 Subject: [PATCH 11/29] Updata paga call to allow RNA velocity, use Scanpy-native graph slot specification --- scanpy-scripts-tests.bats | 4 ++-- scanpy_scripts/cmd_options.py | 12 +++++++++++- scanpy_scripts/lib/_paga.py | 6 ------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 1960e887..e20f4fb5 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -45,7 +45,7 @@ setup() { diffexp_tsv="${output_dir}/diffexp.tsv" diffexp_opt="-g leiden_k10_r0_7 --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv} -f loom" diffexp_obj="${output_dir}/diffexp.h5ad" - paga_opt="--use-graph neighbors_k10 --key-added k10_r0_7 --groups leiden_k10_r0_7 --model v1.2 -f loom" + paga_opt="--neighbors-key neighbors_k10 --key-added louvain_k10_r0_5 --groups louvain_k10_r0_5 --model v1.2" paga_obj="${output_dir}/paga.h5ad" diffmap_embed="${output_dir}/diffmap.tsv" diffmap_opt="--use-graph neighbors_k10 --n-comps 10 -E ${diffmap_embed}" @@ -279,7 +279,7 @@ setup() { skip "$paga_obj exists and resume is set to 'true'" fi - run rm -f $paga_obj && eval "$scanpy paga $paga_opt $leiden_obj $paga_obj" + run rm -f $paga_obj && eval "$scanpy paga $paga_opt $louvain_obj $paga_obj" [ "$status" -eq 0 ] [ -f "$paga_obj" ] diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 91d2e4b9..1247bcbe 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1166,7 +1166,7 @@ 'paga': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], - COMMON_OPTIONS['knn_graph'][0], # --use-graph + COMMON_OPTIONS['knn_graph'][0], # --neighbors-key COMMON_OPTIONS['key_added'], click.option( '--groups', @@ -1182,6 +1182,16 @@ show_default=True, help='The PAGA connectivity model.', ), + click.option( + '--use-rna-velocity', + is_flag=True, + default=False, + show_default=True, + help='Use RNA velocity to orient edges in the abstracted graph and ' + 'estimate transitions. Requires that adata.uns contains a directed single-cell ' + 'graph with key velocity_graph. This feature might be subject to change in the ' + 'future.', + ), ], 'diffmap': [ diff --git a/scanpy_scripts/lib/_paga.py b/scanpy_scripts/lib/_paga.py index 6e49064b..6d320ce0 100644 --- a/scanpy_scripts/lib/_paga.py +++ b/scanpy_scripts/lib/_paga.py @@ -15,20 +15,14 @@ def paga( adata, - use_graph='neighbors', key_added=None, **kwargs, ): """ Wrapper function for sc.tl.paga, for supporting named slot """ - _set_default_key(adata.uns, 'neighbors', use_graph) - _backup_default_key(adata.uns, 'paga') - sc.tl.paga(adata, **kwargs) - _restore_default_key(adata.uns, 'neighbors', use_graph) - if key_added: paga_key = f'paga_{key_added}' _rename_default_key(adata.uns, 'paga', paga_key) From a161abf071e371d9fc46b6e628caf1e7889e7094 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 7 Aug 2020 12:09:30 +0100 Subject: [PATCH 12/29] Set diffmap, dpt to use neighbors_key, add other missing option. --- scanpy-scripts-tests.bats | 4 ++-- scanpy_scripts/cmd_options.py | 13 +++++++++++-- scanpy_scripts/lib/_diffmap.py | 14 +------------- scanpy_scripts/lib/_dpt.py | 21 ++------------------- 4 files changed, 16 insertions(+), 36 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index e20f4fb5..c3fea05b 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -48,9 +48,9 @@ setup() { paga_opt="--neighbors-key neighbors_k10 --key-added louvain_k10_r0_5 --groups louvain_k10_r0_5 --model v1.2" paga_obj="${output_dir}/paga.h5ad" diffmap_embed="${output_dir}/diffmap.tsv" - diffmap_opt="--use-graph neighbors_k10 --n-comps 10 -E ${diffmap_embed}" + diffmap_opt="--neighbors-key neighbors_k10 --n-comps 10 -E ${diffmap_embed}" diffmap_obj="${output_dir}/diffmap.h5ad" - dpt_opt="--use-graph neighbors_k10 --key-added k10 --n-dcs 10 --root leiden_k10_r0_7 0" + dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root louvain_k10_r0_5 0" dpt_obj="${output_dir}/dpt.h5ad" plt_embed_opt="--color leiden_k10_r0_7 -f loom --title test" plt_embed_pdf="${output_dir}/umap_leiden_k10_r0_7.pdf" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 1247bcbe..7ea21508 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1197,7 +1197,7 @@ 'diffmap': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], - COMMON_OPTIONS['knn_graph'][0], # --use-graph + COMMON_OPTIONS['knn_graph'][0], # --neighbors-key COMMON_OPTIONS['key_added'], COMMON_OPTIONS['export_embedding'], COMMON_OPTIONS['n_comps'], @@ -1206,7 +1206,7 @@ 'dpt': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], - COMMON_OPTIONS['knn_graph'][0], # --use-graph + COMMON_OPTIONS['knn_graph'][0], # --neighbors-key COMMON_OPTIONS['key_added'], click.option( '--root', @@ -1239,6 +1239,15 @@ 'do not consider branches/groups that contain fewer than this fraction ' 'of the total number of data points.', ), + click.option( + '--disallow-kendall-tau-shift', 'allow_kendall_tau_shift', + is_flag=True, + default=True, + show_default=True, + help='By default: If a very small branch is detected upon ' + 'splitting, shift away from maximum correlation in Kendall tau criterion of ' + '[Haghverdi16] to stabilize the splitting. Use flag to disable this.' + ), ], 'embed': [ diff --git a/scanpy_scripts/lib/_diffmap.py b/scanpy_scripts/lib/_diffmap.py index d52333fc..e3687058 100644 --- a/scanpy_scripts/lib/_diffmap.py +++ b/scanpy_scripts/lib/_diffmap.py @@ -4,10 +4,6 @@ import scanpy as sc from ..obj_utils import ( - _set_default_key, - _restore_default_key, - _backup_obsm_key, - _delete_obsm_backup_key, _rename_obsm_key, write_embedding, ) @@ -15,7 +11,6 @@ def diffmap( adata, - use_graph='neighbors', key_added=None, export_embedding=None, **kwargs, @@ -23,19 +18,12 @@ def diffmap( """ Wrapper function for sc.tl.diffmap, for supporting named slot """ - _set_default_key(adata.uns, 'neighbors', use_graph) - _backup_obsm_key(adata, 'X_diffmap') - sc.tl.diffmap(adata, **kwargs) - - _restore_default_key(adata.uns, 'neighbors', use_graph) - + diffmap_key = 'X_diffmap' if key_added: diffmap_key = f'{diffmap_key}_{key_added}' _rename_obsm_key(adata, 'X_diffmap', diffmap_key) - else: - _delete_obsm_backup_key(adata, 'X_diffmap') if export_embedding is not None: write_embedding(adata, diffmap_key, export_embedding, key_added=key_added) diff --git a/scanpy_scripts/lib/_dpt.py b/scanpy_scripts/lib/_dpt.py index a5414bc8..98956d19 100644 --- a/scanpy_scripts/lib/_dpt.py +++ b/scanpy_scripts/lib/_dpt.py @@ -5,26 +5,19 @@ import numpy as np import scanpy as sc from ..obj_utils import ( - _set_default_key, - _restore_default_key, - _backup_default_key, - _delete_backup_key, _rename_default_key, - _set_obsm_key, - _restore_obsm_key, ) def dpt( adata, root=None, - use_graph='neighbors', use_diffmap='X_diffmap', key_added=None, **kwargs, ): """ - Wrapper function for sc.tl.dpt, for support named slot + Wrapper function for sc.tl.dpt """ if root is None or not (isinstance(root, (list, tuple)) and len(root) == 2): root = (None, None) @@ -35,19 +28,9 @@ def dpt( adata.uns['iroot'] = np.random.choice( np.flatnonzero(adata.obs[root[0]] == root[1])) - _set_default_key(adata.uns, 'neighbors', use_graph) - _set_obsm_key(adata, 'X_diffmap', use_diffmap) - - _backup_default_key(adata.obs, 'dpt_pseudotime') - sc.tl.dpt(adata, **kwargs) - - _restore_default_key(adata.uns, 'neighbors', use_graph) - _restore_obsm_key(adata, 'X_diffmap', use_diffmap) - if key_added: dpt_key = f'dpt_pseudotime_{key_added}' _rename_default_key(adata.obs, 'dpt_pseudotime', dpt_key) - else: - _delete_backup_key(adata.obs, 'dpt_pseudotime') + return adata From 129483b35f2ed82d17b1b6f043887b3bc720e71a Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 10 Aug 2020 15:51:24 +0100 Subject: [PATCH 13/29] Fixes for embeddings, plot_paga to use new sc.pl.embeddings. --- scanpy-scripts-tests.bats | 10 ++-- scanpy_scripts/cmd_options.py | 99 +++++++++++++++++++++++++++++------ scanpy_scripts/cmd_utils.py | 3 +- scanpy_scripts/cmds.py | 2 +- scanpy_scripts/lib/_paga.py | 2 +- 5 files changed, 91 insertions(+), 25 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index c3fea05b..60fb1d16 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -52,9 +52,9 @@ setup() { diffmap_obj="${output_dir}/diffmap.h5ad" dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root louvain_k10_r0_5 0" dpt_obj="${output_dir}/dpt.h5ad" - plt_embed_opt="--color leiden_k10_r0_7 -f loom --title test" - plt_embed_pdf="${output_dir}/umap_leiden_k10_r0_7.pdf" - plt_paga_opt="--use-key paga_k10_r0_7 --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff" + plt_embed_opt="--projection 2d --color louvain_k10_r0_5 --title test" + plt_embed_pdf="${output_dir}/umap_louvain_k10_r0_5.pdf" + plt_paga_opt="--use-key paga_louvain_k10_r0_5 --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff" plt_paga_pdf="${output_dir}/paga_k10_r0_7.pdf" test_clustering='leiden_k10_r0_3' test_markers='LDHB,CD3D,CD3E' @@ -318,10 +318,10 @@ setup() { skip "$plt_embed_pdf exists and resume is set to 'true'" fi - run rm -f $plt_embed_pdf && eval "$scanpy plot embed $plt_embed_opt $leiden_obj $plt_embed_pdf" + run rm -f $plt_embed_pdf && eval "$scanpy plot embed $plt_embed_opt $louvain_obj $plt_embed_pdf" [ "$status" -eq 0 ] - [ -f "$dpt_obj" ] + [ -f "$plt_embed_pdf" ] } # Run Plot paga diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 7ea21508..36e8276e 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -177,6 +177,18 @@ ), ], + + 'layer':click.option( + '--layer', + type=CommaSeparatedText(simplify=True), + default=None, + show_default=True, + help='Name of the AnnData object layer that wants to be plotted. By ' + 'default adata.raw.X is plotted. If use_raw=False is set, then adata.X ' + 'is plotted. If layer is set to a valid layer name, then the layer is ' + 'plotted. layer takes precedence over use_raw.', + ), + 'n_comps': click.option( '--n-comps', type=click.INT, @@ -310,16 +322,6 @@ 'computed using scanpy.tl.dendrogram(). If tl.dendrogram has not been ' 'called previously the function is called with default parameters.', ), - click.option( - '--layer', - type=CommaSeparatedText(simplify=True), - default=None, - show_default=True, - help='Name of the AnnData object layer that wants to be plotted. By ' - 'default adata.raw.X is plotted. If use_raw=False is set, then adata.X ' - 'is plotted. If layer is set to a valid layer name, then the layer is ' - 'plotted. layer takes precedence over use_raw.', - ), click.option( '--standard-scale', type=click.Choice(['var', 'obs']), @@ -485,6 +487,23 @@ show_default=True, help='If choosing a tree layout, this is the index of the root node.', ), + + 'plot_embed': [ + click.option( + '--use-raw/--no-raw', + default=None, + show_default=True, + help='Use `.raw` attribute for coloring with gene expression. If ' + '`None`, uses `.raw` if present.', + ), + click.option( + '--groups', + type=click.STRING, + default=None, + help='Key for categorical in `.obs`. You can pass your predefined ' + 'groups by choosing any categorical annotation of observations.', + ), + ] } CMD_OPTIONS = { @@ -1254,6 +1273,7 @@ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['plot'], *COMMON_OPTIONS['frame_title'], + COMMON_OPTIONS['layer'], click.option( '--basis', type=click.STRING, @@ -1269,13 +1289,6 @@ show_default=True, help='Keys for annotations of observations/cells or variables/genes.', ), - click.option( - '--use-raw/--no-raw', - default=None, - show_default=True, - help='Use `.raw` attribute for coloring with gene expression. If ' - '`None`, uses `.raw` if present.', - ), click.option( '--legend-loc', type=click.Choice(['right margin', 'on data']), @@ -1299,12 +1312,60 @@ help='Point size. Automatically computed if not specified.', ), COMMON_OPTIONS['gene_symbols'], + click.option( + '--edges', + is_flag=True, + default=False, + show_default=True, + help='Show edges.', + ), + click.option( + '--edges-width', + type=click.FLOAT, + default=0.1, + show_default=True, + help='Width of edges.', + ), + click.option( + '--edges-color', + type=click.STRING, + default=None, + show_default=True, + help='Color of edges. See draw_networkx_edges().', + ), + COMMON_OPTIONS['knn_graph'][0], # --neighbors-key + click.option( + '--no-sort-order', 'sort_order', + is_flag=True, + default=True, + show_default=True, + help='Disable default behaviour: for continuous annotations used as ' + 'color parameter, plot data points with higher values on top of others.', + ), + *COMMON_OPTIONS['plot_embed'], + click.option( + '--components', + type=click.STRING, + default=None, + show_default=True, + help="For instance, ['1,2', '2,3']. To plot all available components use 'all'.", + ), + click.option( + '--projection', + type=click.Choice(['2d', '3d']), + default='2d', + show_default=True, + help="Projection of plot." + ), + ], 'plot_paga': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['plot'], *COMMON_OPTIONS['frame_title'], + *COMMON_OPTIONS['plot_embed'], + COMMON_OPTIONS['random_state'], click.option( '--use-key', type=click.STRING, @@ -1431,6 +1492,7 @@ COMMON_OPTIONS['use_raw'], COMMON_OPTIONS['var_names'], *COMMON_OPTIONS['rank_genes_groups_plots'], + COMMON_OPTIONS['layer'], *COMMON_OPTIONS['diffexp_plot'], COMMON_OPTIONS['gene_symbols'], *COMMON_OPTIONS['sviol'], @@ -1443,6 +1505,7 @@ COMMON_OPTIONS['use_raw'], COMMON_OPTIONS['var_names'], *COMMON_OPTIONS['rank_genes_groups_plots'], + COMMON_OPTIONS['layer'], *COMMON_OPTIONS['diffexp_plot'], COMMON_OPTIONS['gene_symbols'], *COMMON_OPTIONS['dot'], @@ -1454,6 +1517,7 @@ COMMON_OPTIONS['use_raw'], COMMON_OPTIONS['var_names'], *COMMON_OPTIONS['rank_genes_groups_plots'], + COMMON_OPTIONS['layer'], *COMMON_OPTIONS['diffexp_plot'], COMMON_OPTIONS['gene_symbols'], ], @@ -1464,6 +1528,7 @@ COMMON_OPTIONS['use_raw'], COMMON_OPTIONS['var_names'], *COMMON_OPTIONS['rank_genes_groups_plots'], + COMMON_OPTIONS['layer'], *COMMON_OPTIONS['diffexp_plot'], COMMON_OPTIONS['gene_symbols'], *COMMON_OPTIONS['heat'], diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py index 81e691b8..8f7a5665 100644 --- a/scanpy_scripts/cmd_utils.py +++ b/scanpy_scripts/cmd_utils.py @@ -161,6 +161,7 @@ def make_plot_function(func_name, kind=None): # Provide a function translation plot_funcs = { + 'embedding': sc.pl.embedding, 'scatter': sc.pl.scatter, 'sviol': sc.pl.stacked_violin, 'rgg_sviol': sc.pl.rank_genes_groups_stacked_violin, @@ -231,7 +232,7 @@ def plot_function( if output_fig: prefix='' - if func_name == 'scatter': + if func_name == 'scatter' or func_name == 'embedding': prefix = kwargs.get('basis', func.__name__) elif kind: prefix = kind diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index 50887223..e697e69c 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -170,7 +170,7 @@ PLOT_EMBED_CMD = make_subcmd( 'embed', - make_plot_function('scatter'), + make_plot_function('embedding'), cmd_desc='Plot cell embeddings.', arg_desc=_IP_DESC, ) diff --git a/scanpy_scripts/lib/_paga.py b/scanpy_scripts/lib/_paga.py index 6d320ce0..58bb84e9 100644 --- a/scanpy_scripts/lib/_paga.py +++ b/scanpy_scripts/lib/_paga.py @@ -49,7 +49,7 @@ def plot_paga( """Make PAGA plot """ if basis is not None and f'X_{basis}' in adata.obsm.keys(): - ax = sc.plotting._tools.scatterplots.plot_scatter( + ax = sc.pl.embedding( adata, basis=basis, color=color, From 1d2ee553124aa16c9963c34ba34fd029cc82ffd5 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 10 Aug 2020 16:27:14 +0100 Subject: [PATCH 14/29] Fix plotting test params --- scanpy-scripts-tests.bats | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 60fb1d16..3f19bef0 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -42,21 +42,21 @@ setup() { leiden_tsv="${output_dir}/leiden.tsv" leiden_opt="-r 0.3,0.7 --neighbors-key neighbors_k10 --key-added k10 -F loom --loom-write-obsm-varm --export-cluster ${leiden_tsv}" leiden_obj="${output_dir}/leiden.loom" + test_clustering='louvain_k10_r0_5' diffexp_tsv="${output_dir}/diffexp.tsv" - diffexp_opt="-g leiden_k10_r0_7 --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv} -f loom" + diffexp_opt="-g ${test_clustering} --reference rest --filter-params min_in_group_fraction:0.25,min_fold_change:1.5 --save ${diffexp_tsv}" diffexp_obj="${output_dir}/diffexp.h5ad" - paga_opt="--neighbors-key neighbors_k10 --key-added louvain_k10_r0_5 --groups louvain_k10_r0_5 --model v1.2" + paga_opt="--neighbors-key neighbors_k10 --key-added ${test_clustering} --groups ${test_clustering} --model v1.2" paga_obj="${output_dir}/paga.h5ad" diffmap_embed="${output_dir}/diffmap.tsv" diffmap_opt="--neighbors-key neighbors_k10 --n-comps 10 -E ${diffmap_embed}" diffmap_obj="${output_dir}/diffmap.h5ad" - dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root louvain_k10_r0_5 0" + dpt_opt="--neighbors-key neighbors_k10 --key-added k10 --n-dcs 10 --disallow-kendall-tau-shift --root ${test_clustering} 0" dpt_obj="${output_dir}/dpt.h5ad" - plt_embed_opt="--projection 2d --color louvain_k10_r0_5 --title test" - plt_embed_pdf="${output_dir}/umap_louvain_k10_r0_5.pdf" - plt_paga_opt="--use-key paga_louvain_k10_r0_5 --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff" + plt_embed_opt="--projection 2d --color ${test_clustering} --title test" + plt_embed_pdf="${output_dir}/umap_${test_clustering}.pdf" + plt_paga_opt="--use-key paga_${test_clustering} --node-size-scale 2 --edge-width-scale 0.5 --basis diffmap --color dpt_pseudotime_k10 --frameoff" plt_paga_pdf="${output_dir}/paga_k10_r0_7.pdf" - test_clustering='leiden_k10_r0_3' test_markers='LDHB,CD3D,CD3E' diffexp_plot_opt="--var-names $test_markers --use-raw --dendrogram --groupby ${test_clustering}" plt_stacked_violin_opt="${diffexp_plot_opt} --no-jitter --swap-axes" @@ -64,7 +64,7 @@ setup() { plt_dotplot_pdf="${output_dir}/dot_${test_clustering}_LDHB_CD3D_CD3E.pdf" plt_matrixplot_pdf="${output_dir}/matrix_${test_clustering}_LDHB_CD3D_CD3E.pdf" plt_heatmap_pdf="${output_dir}/heatmap_${test_clustering}_LDHB_CD3D_CD3E.pdf" - plt_rank_genes_groups_opt="--rgg --groups 3,5" + plt_rank_genes_groups_opt="--rgg --groups 3,4" plt_rank_genes_groups_stacked_violin_pdf="${output_dir}/rggsviolin_${test_clustering}.pdf" plt_rank_genes_groups_matrix_pdf="${output_dir}/rggmatrix_${test_clustering}.pdf" plt_rank_genes_groups_dot_pdf="${output_dir}/rggdot_${test_clustering}.pdf" @@ -266,7 +266,7 @@ setup() { skip "$diffexp_obj exists and resume is set to 'true'" fi - run rm -f $diffexp_obj $diffexp_tsv && eval "$scanpy diffexp $diffexp_opt $leiden_obj $diffexp_obj" + run rm -f $diffexp_obj $diffexp_tsv && eval "$scanpy diffexp $diffexp_opt $louvain_obj $diffexp_obj" [ "$status" -eq 0 ] [ -f "$diffexp_obj" ] && [ -f "$diffexp_tsv" ] From 4b738de6db2e01b3696fb638242fca33de23bbf7 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Mon, 10 Aug 2020 17:43:31 +0100 Subject: [PATCH 15/29] Some plot files now have underscores --- scanpy_scripts/cmd_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py index 8f7a5665..a2449e07 100644 --- a/scanpy_scripts/cmd_utils.py +++ b/scanpy_scripts/cmd_utils.py @@ -235,9 +235,11 @@ def plot_function( if func_name == 'scatter' or func_name == 'embedding': prefix = kwargs.get('basis', func.__name__) elif kind: - prefix = kind + prefix = kind elif func_name in plot_funcs: prefix = plot_funcs[ func_name ].__name__.split('.')[-1] + if func_name in [ 'sviol', 'rgg_sviol', 'dot', 'rgg_dot', 'matrix', 'rgg_matrix' ]: + prefix = prefix + '_' os.rename( os.path.join(sc.settings.figdir, prefix + figname), output_fig) From 85826e0078ed92b7884aeb3b77f68000230010b1 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 11 Aug 2020 11:10:55 +0100 Subject: [PATCH 16/29] Add wrapper for harmony_integrate --- scanpy-scripts-tests.bats | 46 +++++++++++++++++++++++++++++++++++ scanpy_scripts/cli.py | 8 ++++++ scanpy_scripts/cmd_options.py | 29 ++++++++++++++++++++++ scanpy_scripts/cmds.py | 8 ++++++ 4 files changed, 91 insertions(+) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 3f19bef0..08333678 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -69,6 +69,12 @@ setup() { plt_rank_genes_groups_matrix_pdf="${output_dir}/rggmatrix_${test_clustering}.pdf" plt_rank_genes_groups_dot_pdf="${output_dir}/rggdot_${test_clustering}.pdf" plt_rank_genes_groups_heatmap_pdf="${output_dir}/rggheatmap_${test_clustering}.pdf" + harmony_integrate_obj="${output_dir}/harmony_integrate.h5ad" + harmony_integrate_opt="--batch-key ${test_clustering}" + harmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings after harmony' --basis 'X_pca_harmony'" + noharmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings before harmony' --basis 'X_pca'" + harmony_integrated_pca_pdf="${output_dir}/harmony_pca_${test_clustering}.pdf" + noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf" if [ ! -d "$data_dir" ]; then mkdir -p $data_dir @@ -442,6 +448,46 @@ setup() { [ -f "$plt_rank_genes_groups_matrix_pdf" ] } +# Do harmony batch correction, using clustering as batch (just for test purposes) + +@test "Run Harmony batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$harmony_integrate_obj" ]; then + skip "$harmony_integrate_obj exists and resume is set to 'true'" + fi + + run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony_integrate $harmony_integrate_opt $louvain_obj $harmony_integrate_obj" + + [ "$status" -eq 0 ] + [ -f "$plt_rank_genes_groups_matrix_pdf" ] + +} + +# Run Plot PCA embedding before harmony + +@test "Run Plot PCA embedding before Harmony" { + if [ "$resume" = 'true' ] && [ -f "$noharmony_integrated_pca_pdf" ]; then + skip "$noharmony_integrated_pca_pdf exists and resume is set to 'true'" + fi + + run rm -f $noharmony_integrated_pca_pdf && eval "$scanpy plot embed $noharmony_plt_embed_opt $louvain_obj $noharmony_integrated_pca_pdf" + + [ "$status" -eq 0 ] + [ -f "$noharmony_integrated_pca_pdf" ] +} + +# Run Plot PCA embedding after harmony + +@test "Run Plot PCA embedding after Harmony" { + if [ "$resume" = 'true' ] && [ -f "$harmony_integrated_pca_pdf" ]; then + skip "$harmony_integrated_pca_pdf exists and resume is set to 'true'" + fi + + run rm -f $harmony_integrated_pca_pdf && eval "$scanpy plot embed $harmony_plt_embed_opt $harmony_integrate_obj $harmony_integrated_pca_pdf" + + [ "$status" -eq 0 ] + [ -f "$harmony_integrated_pca_pdf" ] +} + # Local Variables: # mode: sh # End: diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py index dc7a07d7..248aa59a 100755 --- a/scanpy_scripts/cli.py +++ b/scanpy_scripts/cli.py @@ -30,6 +30,7 @@ PLOT_DOT_CMD, PLOT_MATRIX_CMD, PLOT_HEATMAP_CMD, + HARMONY_INTEGRATE_CMD, ) @@ -101,6 +102,13 @@ def cluster(): cli.add_command(DPT_CMD) +@cli.group(cls=NaturalOrderGroup) +def integrate(): + """Integrate cells from different experimental batches.""" + +integrate.add_command(HARMONY_INTEGRATE_CMD) + + @cli.group(cls=NaturalOrderGroup) def plot(): """Visualise data.""" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 36e8276e..ce3a071b 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1269,6 +1269,35 @@ ), ], + 'harmony_integrate': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + click.option( + '--basis', + type=click.STRING, + default='X_pca', + show_default=True, + help="The name of the field in adata.obsm where the PCA table is " + "stored. Defaults to 'X_pca', which is the default for sc.tl.pca()." + ), + click.option( + '--batch-key', 'key', + type=click.STRING, + required=True, + help='The name of the column in adata.obs that differentiates among ' + 'experiments/batches.' + ), + click.option( + '--adjusted-basis', + type=click.STRING, + default='X_pca_harmony', + show_default=True, + help='The name of the field in adata.obsm where the adjusted PCA ' + 'table will be stored after running this function.' + ), + + ], + 'embed': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['plot'], diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index e697e69c..614418bf 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -5,6 +5,7 @@ import os import sys import scanpy as sc +import scanpy.external as sce from .cmd_utils import ( make_subcmd, @@ -210,3 +211,10 @@ arg_desc=_IP_DESC, opt_set='plot_paga' ) + +HARMONY_INTEGRATE_CMD = make_subcmd( + 'harmony_integrate', + sce.pp.harmony_integrate, + cmd_desc='Use harmonypy [Korunsky19] to integrate different experiments.', + arg_desc=_IO_DESC, +) From bbe1f524a34e22ef9881ee89aa51c3f777b1a949 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 11 Aug 2020 13:14:03 +0100 Subject: [PATCH 17/29] Add bbknn CLI layer --- scanpy-scripts-tests.bats | 17 +++++++ scanpy_scripts/cli.py | 2 + scanpy_scripts/cmd_options.py | 94 +++++++++++++++++++++++++++++++++++ scanpy_scripts/cmds.py | 8 +++ scanpy_scripts/lib/_bbknn.py | 28 +++++++++++ 5 files changed, 149 insertions(+) create mode 100644 scanpy_scripts/lib/_bbknn.py diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 08333678..39ee2d7f 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -75,6 +75,9 @@ setup() { noharmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings before harmony' --basis 'X_pca'" harmony_integrated_pca_pdf="${output_dir}/harmony_pca_${test_clustering}.pdf" noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf" + bbknn_obj="${output_dir}/bbknn.h5ad" + bbknn_opt="--batch-key ${test_clustering} --key-added bbknn" + if [ ! -d "$data_dir" ]; then mkdir -p $data_dir @@ -488,6 +491,20 @@ setup() { [ -f "$harmony_integrated_pca_pdf" ] } +# Do bbknn batch correction, using clustering as batch (just for test purposes) + +@test "Run BBKNN batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$bbknn_obj" ]; then + skip "$bbknn_obj exists and resume is set to 'true'" + fi + + run rm -f $bbknn_obj && echo "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" + + [ "$status" -eq 0 ] + [ -f "$plt_rank_genes_groups_matrix_pdf" ] + +} + # Local Variables: # mode: sh # End: diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py index 248aa59a..a49f2f03 100755 --- a/scanpy_scripts/cli.py +++ b/scanpy_scripts/cli.py @@ -31,6 +31,7 @@ PLOT_MATRIX_CMD, PLOT_HEATMAP_CMD, HARMONY_INTEGRATE_CMD, + BBKNN_CMD, ) @@ -107,6 +108,7 @@ def integrate(): """Integrate cells from different experimental batches.""" integrate.add_command(HARMONY_INTEGRATE_CMD) +integrate.add_command(BBKNN_CMD) @cli.group(cls=NaturalOrderGroup) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index ce3a071b..c1aa2298 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1298,6 +1298,100 @@ ], + 'bbknn': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + COMMON_OPTIONS['key_added'], + click.option( + '--use-rep', '-u', + type=click.STRING, + default='X_pca', + show_default=True, + help='The dimensionality reduction in .obsm to use for neighbour ' + 'detection.' + ), + COMMON_OPTIONS['use_pc'][0], # --n-pcs + click.option( + '--batch-key', 'batch_key', + type=click.STRING, + required=True, + help='adata.obs column name discriminating between your batches.' + ), + click.option( + '--no-approx', 'approx', + is_flag=True, + default=True, + help='Default behaviour is to use annoy’s approximate neighbour ' + 'finding. This results in a quicker run time for large datasets while also ' + 'potentially increasing the degree of batch correction. Use this flag to disable ' + 'that behaviour.', + ), + click.option( + '--metric', '-t', + type=click.Choice(['angular', 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']), + default='angular', + show_default=True, + help='A known metric’s name.' + ), + click.option( + '--neighbors-within-batch', + type=click.INT, + default=3, + show_default=True, + help='How many top neighbours to report for each batch; total ' + 'number of neighbours will be this number times the number of batches.' + ), + click.option( + '--trim', + type=click.INT, + default=None, + show_default=True, + help='Trim the neighbours of each cell to these many top ' + 'connectivities. May help with population independence and improve the tidiness ' + 'of clustering. The lower the value the more independent the individual ' + 'populations, at the cost of more conserved batch effect. If None, sets the ' + 'parameter value automatically to 10 times the total number of neighbours for ' + 'each cell. Set to 0 to skip.' + ), + click.option( + '--n-trees', + type=click.INT, + default=10, + show_default=True, + help='Only used when approx=True. The number of trees to construct ' + 'in the annoy forest. More trees give higher precision when querying, at the ' + 'cost of increased run time and resource intensity.' + ), + click.option( + '--no-use-faiss', 'use_faiss', + is_flag=True, + default=True, + help='Default behaviour If approx=False and the metric is ' + '“euclidean”, is to use the faiss package to compute nearest neighbours if ' + 'installed. This improves performance at a minor cost to numerical precision as ' + 'faiss operates on float32. Use this flag to disable that behaviour.' + ), + click.option( + '--set-op-mix-ratio', + type=click.FLOAT, + default=1, + show_default=True, + help='UMAP connectivity computation parameter, float between 0 and ' + '1, controlling the blend between a connectivity matrix formed exclusively from ' + 'mutual nearest neighbour pairs (0) and a union of all observed neighbour ' + 'relationships with the mutual pairs emphasised (1).' + ), + click.option( + '--local-connectivity', + type=click.INT, + default=1, + show_default=True, + help='UMAP connectivity computation parameter, how many nearest ' + 'neighbors of each cell are assumed to be fully connected (and given a ' + 'connectivity value of 1)' + ), + ], + 'embed': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['plot'], diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index 614418bf..7c94a503 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -26,6 +26,7 @@ from .lib._paga import paga from .lib._diffmap import diffmap from .lib._dpt import dpt +from .lib._bbknn import bbknn LANG = os.environ.get('LANG', None) @@ -218,3 +219,10 @@ cmd_desc='Use harmonypy [Korunsky19] to integrate different experiments.', arg_desc=_IO_DESC, ) + +BBKNN_CMD = make_subcmd( + 'bbknn', + bbknn, + cmd_desc='Batch balanced kNN [Polanski19].', + arg_desc=_IO_DESC, +) diff --git a/scanpy_scripts/lib/_bbknn.py b/scanpy_scripts/lib/_bbknn.py new file mode 100644 index 00000000..4834dae9 --- /dev/null +++ b/scanpy_scripts/lib/_bbknn.py @@ -0,0 +1,28 @@ +""" +scanpy external bbknn +""" + +import scanpy.external as sce + +from ..obj_utils import ( + _backup_default_key, + _delete_backup_key, + _rename_default_key, +) + +# Wrapper for bbknn allowing use of non-standard slot + +def bbknn(adata, key_added=None, **kwargs): + """ + Wrapper function for sce.pp.bbknn(), for supporting non-standard neighbors slot + """ + + _backup_default_key(adata.uns, 'neighbors') + sce.pp.bbknn(adata, **kwargs) + + if key_added: + _rename_default_key(adata.uns, 'neighbors', f'neighbors_{key_added}') + else: + _delete_backup_key(adata.uns, 'neighbors') + + return adata From 8e6e860929aa943a34437bdfd71882a9264cf234 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 11 Aug 2020 16:41:46 +0100 Subject: [PATCH 18/29] Add batch correction methods to dependencies for testing --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ca4e0eef..c94b1ce3 100644 --- a/setup.py +++ b/setup.py @@ -47,6 +47,8 @@ 'loompy>=2.0.0,<3.0.0', 'MulticoreTSNE', 'Click', - 'umap-learn<0.4.0' + 'umap-learn<0.4.0', + 'harmonypy>=0.0.5', + 'bbknn>=1.3.12' ], ) From 2a564e79a83aeed37c8ea333fbf2c9db508f9d86 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 11 Aug 2020 16:42:27 +0100 Subject: [PATCH 19/29] Add more harmonypy options --- scanpy_scripts/cmd_options.py | 84 ++++++++++++++++++++++++++++++++++- 1 file changed, 83 insertions(+), 1 deletion(-) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index c1aa2298..e2d6219d 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1295,7 +1295,89 @@ help='The name of the field in adata.obsm where the adjusted PCA ' 'table will be stored after running this function.' ), - + click.option( + '--theta', + type=click.FLOAT, + default=2, + show_default=True, + help='Diversity clustering penalty parameter. theta=0 does not encourage any ' + 'diversity. Larger values of theta result in more diverse clusters.' + ), + click.option( + '--lambda', 'lamb', + type=click.FLOAT, + default=1, + show_default=True, + help='Ridge regression penalty parameter. Lambda must be strictly ' + 'positive. Smaller values result in more aggressive correction.' + ), + click.option( + '--sigma', + type=click.FLOAT, + default=0.1, + show_default=True, + help='Width of soft kmeans clusters. Sigma scales the distance from ' + 'a cell to cluster centroids. Larger values of sigma result in cells assigned to ' + 'more clusters. Smaller values of sigma make soft kmeans cluster approach hard ' + 'clustering.' + ), + click.option( + '--n-clust', 'nclust', + type=click.INT, + default=None, + show_default=False, + help='Number of clusters in model. nclust=1 equivalent to simple ' + 'linear regression.' + ), + click.option( + '--tau', + type=click.INT, + default=0, + show_default=True, + help='Protection against overclustering small datasets with large ones. ' + 'tau is the expected number of cells per cluster.' + ), + click.option( + '--block-size', + type=click.FLOAT, + default=0.05, + show_default=True, + help='What proportion of cells to update during clustering. Between ' + '0 to 1, default 0.05. Larger values may be faster but less accurate.' + ), + click.option( + '--max-iter-cluster', 'max_iter_kmeans', + type=click.INT, + default=20, + show_default=True, + help='Maximum number of rounds to run clustering at each round of ' + 'Harmony.' + ), + click.option( + '--max-iter-harmony', + type=click.INT, + default=10, + show_default=True, + help='Maximum number of rounds to run Harmony. One round of Harmony ' + 'involves one clustering and one correction step.' + ), + click.option( + '--epsilon-cluster', + type=click.FLOAT, + default=1e-5, + show_default=True, + help='Convergence tolerance for clustering round of Harmony Set to ' + '-Inf to never stop early.' + ), + click.option( + '--epsilon-harmony', + type=click.FLOAT, + default=1e-5, + show_default=True, + help='Convergence tolerance for clustering round of Harmony Set to ' + '-Inf to never stop early.' + ), + COMMON_OPTIONS['random_state'], ], 'bbknn': [ From 0faa36948e028e224f0cf190f98342512d8ad790 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 13 Aug 2020 19:31:34 +0100 Subject: [PATCH 20/29] Added MNN batch correction --- scanpy-scripts-tests.bats | 18 +++++- scanpy_scripts/cli.py | 2 + scanpy_scripts/cmd_options.py | 102 ++++++++++++++++++++++++++++++++++ scanpy_scripts/cmds.py | 8 +++ scanpy_scripts/lib/_mnn.py | 85 ++++++++++++++++++++++++++++ 5 files changed, 214 insertions(+), 1 deletion(-) create mode 100644 scanpy_scripts/lib/_mnn.py diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 39ee2d7f..ec12d447 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -77,6 +77,8 @@ setup() { noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf" bbknn_obj="${output_dir}/bbknn.h5ad" bbknn_opt="--batch-key ${test_clustering} --key-added bbknn" + mnn_obj="${output_dir}/mnn.h5ad" + mnn_opt="--batch-key ${test_clustering}" if [ ! -d "$data_dir" ]; then @@ -498,13 +500,27 @@ setup() { skip "$bbknn_obj exists and resume is set to 'true'" fi - run rm -f $bbknn_obj && echo "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" + run rm -f $bbknn_obj && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" [ "$status" -eq 0 ] [ -f "$plt_rank_genes_groups_matrix_pdf" ] } +# Do bbknn batch correction, using clustering as batch (just for test purposes) + +@test "Run MNN batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then + skip "$mnn_obj exists and resume is set to 'true'" + fi + + run rm -f $mnn_obj && eval "$scanpy integrate mnn_correct $mnn_opt $louvain_obj $mnn_obj" + + [ "$status" -eq 0 ] + [ -f "$mnn_obj" ] + +} + # Local Variables: # mode: sh # End: diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py index a49f2f03..0fee8b9f 100755 --- a/scanpy_scripts/cli.py +++ b/scanpy_scripts/cli.py @@ -32,6 +32,7 @@ PLOT_HEATMAP_CMD, HARMONY_INTEGRATE_CMD, BBKNN_CMD, + MNN_CORRECT_CMD, ) @@ -109,6 +110,7 @@ def integrate(): integrate.add_command(HARMONY_INTEGRATE_CMD) integrate.add_command(BBKNN_CMD) +integrate.add_command(MNN_CORRECT_CMD) @cli.group(cls=NaturalOrderGroup) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index e2d6219d..e02ccc1f 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1380,6 +1380,108 @@ COMMON_OPTIONS['random_state'], ], + 'mnn_correct': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + click.option( + '--layer', '-l', + type=click.STRING, + default=None, + show_default=True, + help="Layer to batch correct. By default corrects the contents of .X." + ), + click.option( + '--key-added', + type=click.STRING, + default=None, + show_default=True, + help="Key under which to add the computed results. By default a new " + "layer will be created called 'mnnnn', 'bbknn_{layer}' or " + "'bbknn_layer_{key_added}' where those parameters were specified. A value of 'X' " + "causes batch-corrected values to overwrite the original content of .X." + ), + click.option( + '--var-subset', + type=(click.STRING, CommaSeparatedText()), + multiple=True, + help="The subset of vars (list of str) to be used when performing " + "MNN correction in the format of '--var-subset '. Typically, use " + "the highly variable genes (HVGs) like '--var-subset highly_variable True'. When " + "unset, uses all vars." + ), + click.option( + '--batch-key', 'batch_key', + type=click.STRING, + required=True, + help='adata.obs column name discriminating between your batches.' + ), + click.option( + '--n-neighbors', '-k', + type=CommaSeparatedText(click.INT, simplify=True), + default=20, + show_default=True, + help='Number of mutual nearest neighbors.' + ), + click.option( + '--sigma', + type=click.FLOAT, + default=1.0, + show_default=True, + help='The bandwidth of the Gaussian smoothing kernel used to ' + 'compute the correction vectors.' + ), + click.option( + '--no-cos_norm_in', 'cos_norm_in', + is_flag=True, + default=True, + help='Default behaviour is to perform cosine normalization on the ' + 'input data prior to calculating distances between cells. Use this ' + 'flag to disable that behaviour.' + ), + click.option( + '--no-cos_norm_out', 'cos_norm_out', + is_flag=True, + default=True, + help='Default behaviour is to perform cosine normalization prior to ' + 'computing corrected expression values. Use this flag to disable that ' + 'behaviour.' + ), + click.option( + '--svd-dim', + type=click.INT, + default=None, + show_default=True, + help='The number of dimensions to use for summarizing biological ' + 'substructure within each batch. If not set, biological components ' + 'will not be removed from the correction vectors.' + ), + click.option( + '--no-var-adj', + is_flag=True, + default=True, + help='Default behaviour is to adjust variance of the correction ' + 'vectors. Use this flag to disable that behaviour. Note this step takes most ' + 'computing time.' + ), + click.option( + '--compute-angle', + is_flag=True, + default=False, + help='When set, compute the angle between each cell’s correction ' + 'vector and the biological subspace of the reference batch.' + ), + click.option( + '--svd-mode', + type=click.Choice(['svd', 'rsvd', 'irlb']), + default='rsvd', + show_default=True, + help="'svd' computes SVD using a non-randomized SVD-via-ID " + "algorithm, while 'rsvd' uses a randomized version. 'irlb' performs truncated " + "SVD by implicitly restarted Lanczos bidiagonalization (forked from " + "https://github.com/airysen/irlbpy)." + ), + ], + 'bbknn': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index 7c94a503..1cd32732 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -27,6 +27,7 @@ from .lib._diffmap import diffmap from .lib._dpt import dpt from .lib._bbknn import bbknn +from .lib._mnn import mnn_correct LANG = os.environ.get('LANG', None) @@ -226,3 +227,10 @@ cmd_desc='Batch balanced kNN [Polanski19].', arg_desc=_IO_DESC, ) + +MNN_CORRECT_CMD = make_subcmd( + 'mnn_correct', + mnn_correct, + cmd_desc='Correct batch effects by matching mutual nearest neighbors [Haghverdi18] [Kang18].', + arg_desc=_IO_DESC, +) diff --git a/scanpy_scripts/lib/_mnn.py b/scanpy_scripts/lib/_mnn.py new file mode 100644 index 00000000..c84358ab --- /dev/null +++ b/scanpy_scripts/lib/_mnn.py @@ -0,0 +1,85 @@ +""" +scanpy external mnn +""" + +import scanpy.external as sce +import numpy as np +import click + +# Wrapper for mnn allowing use of non-standard slot + +def mnn_correct(adata, batch_key=None, key_added=None, var_subset=None, layer=None, **kwargs): + """ + Wrapper function for sce.pp.mnn_correct(), for supporting non-standard neighbors slot + """ + + # mnn will use .X, so we need to put other layers there for processing + + if layer: + adata.layers['X_backup'] = adata.X + adata.X = adata.layers[layer] + + # mnn_correct() wants batches in separate adatas + + batches = np.unique(adata.obs[batch_key]) + alldata = [] + for batch in batches: + alldata.append( adata[adata.obs[batch_key] == batch,] ) + + # Process var_subset into a list of strings that can be provided to + # mnn_correct() + + if var_subset is not None and len(var_subset) > 0 and var_subset[0] is not None: + + subset = [] + + for name, values in var_subset : + if name in adata.var: + if adata.var[name].dtype == 'bool': + values = [ True if x.lower() == "true" else x for x in values ] + else: + raise click.ClickException(f'Var "{name}" unavailable') + + ind = [ x in values for x in adata.var[name] ] + subset = subset + adata.var.index[ ind ].to_list() + + var_subset = set(subset) + print('Will use %d selected genes for MNN' % len(var_subset)) + + else: + var_subset = None + + # Here's the main bit + + cdata = sce.pp.mnn_correct(*alldata, var_subset = var_subset, do_concatenate = True, index_unique = None, **kwargs) + + # If user has specified key_added = X then they want us to overwrite .X, + # othwerwise copy the .X to a named layer of the original object. In either + # case make sure obs and var are the same as the original. + + if key_added is None or key_added != 'X': + + mnn_key = 'mnn' + if layer: + mnn_key = f"{mnn_key}_{layer}" + + # Layers is set (so we're not storing computed results in the .X, + # and we had to overwrite .X to run mnn), and key_added shows we're + # not storing in the .X, so we need to restore from the backup. + + adata.X = adata.layers['X_backup'] + + if key_added: + mnn_key = f"{mnn_key}_{key_added}" + + adata.layers[mnn_key] = cdata[0][adata.obs.index, adata.var.index].X + + else: + adata.X = cdata[0][adata.obs.index, adata.var.index].X + + # Delete the backup of .X if we needed one + + if layer: + del adata.layers['X_backup'] + + return adata From 4fb0b96af4bd307b7fdfd2ea6ec4288876233bb1 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Thu, 13 Aug 2020 19:37:53 +0100 Subject: [PATCH 21/29] add mnnpy to deps --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c94b1ce3..3760c759 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ 'Click', 'umap-learn<0.4.0', 'harmonypy>=0.0.5', - 'bbknn>=1.3.12' + 'bbknn>=1.3.12', + 'mnnpy>=0.1.9.5' ], ) From 924a74599ba74b3fcc4b51bede2118b800eeee1b Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 14 Aug 2020 15:22:01 +0100 Subject: [PATCH 22/29] Minor fixes, add ComBat, unify args --- scanpy-scripts-tests.bats | 17 +++++++- scanpy_scripts/cli.py | 2 + scanpy_scripts/cmd_options.py | 82 ++++++++++++++++++++++------------- scanpy_scripts/cmds.py | 8 ++++ scanpy_scripts/lib/_bbknn.py | 4 +- scanpy_scripts/lib/_mnn.py | 6 +-- 6 files changed, 83 insertions(+), 36 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index ec12d447..0839d19a 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -79,6 +79,8 @@ setup() { bbknn_opt="--batch-key ${test_clustering} --key-added bbknn" mnn_obj="${output_dir}/mnn.h5ad" mnn_opt="--batch-key ${test_clustering}" + combat_obj="${output_dir}/combat.h5ad" + combat_opt="--batch-key ${test_clustering}" if [ ! -d "$data_dir" ]; then @@ -504,10 +506,9 @@ setup() { [ "$status" -eq 0 ] [ -f "$plt_rank_genes_groups_matrix_pdf" ] - } -# Do bbknn batch correction, using clustering as batch (just for test purposes) +# Do MNN batch correction, using clustering as batch (just for test purposes) @test "Run MNN batch integration using clustering as batch" { if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then @@ -518,7 +519,19 @@ setup() { [ "$status" -eq 0 ] [ -f "$mnn_obj" ] +} + +# Do ComBat batch correction, using clustering as batch (just for test purposes) +@test "Run Combat batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$combat_obj" ]; then + skip "$combat_obj exists and resume is set to 'true'" + fi + + run rm -f $combat_obj && eval "$scanpy integrate combat $combat_opt $louvain_obj $combat_obj" + + [ "$status" -eq 0 ] + [ -f "$combat_obj" ] } # Local Variables: diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py index 0fee8b9f..b5ed0bfc 100755 --- a/scanpy_scripts/cli.py +++ b/scanpy_scripts/cli.py @@ -33,6 +33,7 @@ HARMONY_INTEGRATE_CMD, BBKNN_CMD, MNN_CORRECT_CMD, + COMBAT_CMD, ) @@ -111,6 +112,7 @@ def integrate(): integrate.add_command(HARMONY_INTEGRATE_CMD) integrate.add_command(BBKNN_CMD) integrate.add_command(MNN_CORRECT_CMD) +integrate.add_command(COMBAT_CMD) @cli.group(cls=NaturalOrderGroup) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index e02ccc1f..1185a96e 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -503,7 +503,23 @@ help='Key for categorical in `.obs`. You can pass your predefined ' 'groups by choosing any categorical annotation of observations.', ), - ] + ], + + 'batch_key': click.option( + '--batch-key', 'key', + type=click.STRING, + required=True, + help='The name of the column in adata.obs that differentiates among ' + 'experiments/batches.' + ), + + 'batch_layer': click.option( + '--layer', '-l', + type=click.STRING, + default=None, + show_default=True, + help="Layer to batch correct. By default corrects the contents of .X." + ), } CMD_OPTIONS = { @@ -1269,9 +1285,40 @@ ), ], + 'combat': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + COMMON_OPTIONS['batch_key'], + COMMON_OPTIONS['batch_layer'], + click.option( + '--key-added', + type=click.STRING, + default=None, + show_default=True, + help="Key under which to add the computed results. By default a new " + "layer will be created called 'combat', 'combat_{layer}' or " + "'combat_layer_{key_added}' where those parameters were specified. A value of 'X' " + "causes batch-corrected values to overwrite the original content of .X." + ), + click.option( + '--covariates', + type=(CommaSeparatedText()), + default=None, + show_default=True, + help="Comma-separated list of additional covariates besides the " + "batch variable such as adjustment variables or biological condition. This " + "parameter refers to the design matrix X in Equation 2.1 in [Johnson07] and to " + "the mod argument in the original combat function in the sva R package. Note " + "that not including covariates may introduce bias or lead to the removal of " + "biological signal in unbalanced designs." + ), + + ], + 'harmony_integrate': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], + COMMON_OPTIONS['batch_key'], click.option( '--basis', type=click.STRING, @@ -1280,13 +1327,6 @@ help="The name of the field in adata.obsm where the PCA table is " "stored. Defaults to 'X_pca', which is the default for sc.tl.pca()." ), - click.option( - '--batch-key', 'key', - type=click.STRING, - required=True, - help='The name of the column in adata.obs that differentiates among ' - 'experiments/batches.' - ), click.option( '--adjusted-basis', type=click.STRING, @@ -1383,21 +1423,16 @@ 'mnn_correct': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], - click.option( - '--layer', '-l', - type=click.STRING, - default=None, - show_default=True, - help="Layer to batch correct. By default corrects the contents of .X." - ), + COMMON_OPTIONS['batch_key'], + COMMON_OPTIONS['batch_layer'], click.option( '--key-added', type=click.STRING, default=None, show_default=True, help="Key under which to add the computed results. By default a new " - "layer will be created called 'mnnnn', 'bbknn_{layer}' or " - "'bbknn_layer_{key_added}' where those parameters were specified. A value of 'X' " + "layer will be created called 'mnn', 'mnn_{layer}' or " + "'mnn_layer_{key_added}' where those parameters were specified. A value of 'X' " "causes batch-corrected values to overwrite the original content of .X." ), click.option( @@ -1409,12 +1444,6 @@ "the highly variable genes (HVGs) like '--var-subset highly_variable True'. When " "unset, uses all vars." ), - click.option( - '--batch-key', 'batch_key', - type=click.STRING, - required=True, - help='adata.obs column name discriminating between your batches.' - ), click.option( '--n-neighbors', '-k', type=CommaSeparatedText(click.INT, simplify=True), @@ -1486,6 +1515,7 @@ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], COMMON_OPTIONS['key_added'], + COMMON_OPTIONS['batch_key'], click.option( '--use-rep', '-u', type=click.STRING, @@ -1495,12 +1525,6 @@ 'detection.' ), COMMON_OPTIONS['use_pc'][0], # --n-pcs - click.option( - '--batch-key', 'batch_key', - type=click.STRING, - required=True, - help='adata.obs column name discriminating between your batches.' - ), click.option( '--no-approx', 'approx', is_flag=True, diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index 1cd32732..5b9c5470 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -28,6 +28,7 @@ from .lib._dpt import dpt from .lib._bbknn import bbknn from .lib._mnn import mnn_correct +from .lib._combat import combat LANG = os.environ.get('LANG', None) @@ -214,6 +215,13 @@ opt_set='plot_paga' ) +COMBAT_CMD = make_subcmd( + 'combat', + combat, + cmd_desc='ComBat function for batch effect correction', + arg_desc=_IO_DESC +) + HARMONY_INTEGRATE_CMD = make_subcmd( 'harmony_integrate', sce.pp.harmony_integrate, diff --git a/scanpy_scripts/lib/_bbknn.py b/scanpy_scripts/lib/_bbknn.py index 4834dae9..a10cb3a0 100644 --- a/scanpy_scripts/lib/_bbknn.py +++ b/scanpy_scripts/lib/_bbknn.py @@ -12,13 +12,13 @@ # Wrapper for bbknn allowing use of non-standard slot -def bbknn(adata, key_added=None, **kwargs): +def bbknn(adata, key=None, key_added=None, **kwargs): """ Wrapper function for sce.pp.bbknn(), for supporting non-standard neighbors slot """ _backup_default_key(adata.uns, 'neighbors') - sce.pp.bbknn(adata, **kwargs) + sce.pp.bbknn(adata, batch_key = key, **kwargs) if key_added: _rename_default_key(adata.uns, 'neighbors', f'neighbors_{key_added}') diff --git a/scanpy_scripts/lib/_mnn.py b/scanpy_scripts/lib/_mnn.py index c84358ab..0a559a64 100644 --- a/scanpy_scripts/lib/_mnn.py +++ b/scanpy_scripts/lib/_mnn.py @@ -8,7 +8,7 @@ # Wrapper for mnn allowing use of non-standard slot -def mnn_correct(adata, batch_key=None, key_added=None, var_subset=None, layer=None, **kwargs): +def mnn_correct(adata, key=None, key_added=None, var_subset=None, layer=None, **kwargs): """ Wrapper function for sce.pp.mnn_correct(), for supporting non-standard neighbors slot """ @@ -21,10 +21,10 @@ def mnn_correct(adata, batch_key=None, key_added=None, var_subset=None, layer=No # mnn_correct() wants batches in separate adatas - batches = np.unique(adata.obs[batch_key]) + batches = np.unique(adata.obs[key]) alldata = [] for batch in batches: - alldata.append( adata[adata.obs[batch_key] == batch,] ) + alldata.append( adata[adata.obs[key] == batch,] ) # Process var_subset into a list of strings that can be provided to # mnn_correct() From 79ad11cf5c7fd2a0582f474d7690006756c93733 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 14 Aug 2020 17:05:11 +0100 Subject: [PATCH 23/29] Set batch handling to scanpy-native method for hvg, to properly respond to flavor = 'seurat_v3' --- scanpy_scripts/cmd_options.py | 26 ++++++++++++++----- scanpy_scripts/lib/_hvg.py | 48 ++++++++--------------------------- 2 files changed, 30 insertions(+), 44 deletions(-) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 1185a96e..f0dbf7e6 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -713,6 +713,14 @@ help='Cutoffs for the dispersion of expression' 'in the format of "-d min max".', ), + click.option( + '--span', + type=click.FLOAT, + default=0.3, + show_default=True, + help="The fraction of the data (cells) used when estimating the " + "variance in the loess model fit if flavor='seurat_v3'." + ), click.option( '--n-bins', '-b', type=click.INT, @@ -729,7 +737,7 @@ ), click.option( '--flavor', '-v', - type=click.Choice(['seurat', 'cellranger']), + type=click.Choice(['seurat', 'cellranger', 'seurat_v3']), default='seurat', show_default=True, help='Choose the flavor for computing normalized dispersion.', @@ -742,12 +750,16 @@ 'only flag highly-variable genes.', ), click.option( - '--by-batch', '-B', - type=(click.STRING, click.INT), - default=(None, None), - show_default=True, - help='Find highly variable genes within each batch defined by ' - 'then pool and keep those found in at least batches.', + '--batch-key', 'batch_key', + type=click.STRING, + default=None, + help="If specified, highly-variable genes are selected within each " + "batch separately and merged. This simple process avoids the selection of " + "batch-specific genes and acts as a lightweight batch correction method. For all " + "flavors, genes are first sorted by how many batches they are a HVG. For " + "dispersion-based flavors ties are broken by normalized dispersion. If flavor = " + "'seurat_v3', ties are broken by the median (across batches) rank based on " + "within-batch normalized variance." ), ], diff --git a/scanpy_scripts/lib/_hvg.py b/scanpy_scripts/lib/_hvg.py index a4101729..3cd53b73 100644 --- a/scanpy_scripts/lib/_hvg.py +++ b/scanpy_scripts/lib/_hvg.py @@ -9,50 +9,24 @@ def hvg( adata, mean_limits=(0.0125, 3), disp_limits=(0.5, float('inf')), - subset=False, - by_batch=None, **kwargs, ): """ - Wrapper function for sc.highly_variable_genes(), mainly to support searching - by batch and pooling. + Wrapper function for sc.highly_variable_genes() """ # Check for n_top_genes beeing greater than the total genes if 'n_top_genes' in kwargs and kwargs['n_top_genes'] is not None: kwargs['n_top_genes'] = min(adata.n_vars, kwargs['n_top_genes']) - - if by_batch and isinstance(by_batch, (list, tuple)) and by_batch[0]: - batch_name = by_batch[0] - min_n = by_batch[1] - k_hvg = np.zeros(adata.shape[1], dtype=int) - for batch_value in adata.obs[batch_name].unique(): - k_v = adata.obs[batch_name] == batch_value - ad = adata[k_v, :] - k_f, _ = sc.pp.filter_genes(ad, min_cells=3, inplace=False) - hvg = sc.pp.highly_variable_genes( - ad[:, k_f], - min_mean=mean_limits[0], - max_mean=mean_limits[1], - min_disp=disp_limits[0], - max_disp=disp_limits[1], - inplace=False, - **kwargs, - )['highly_variable'] - k_hvg[k_f] += hvg - if subset: - adata = adata[:, k_hvg >= min_n] - else: - adata.var['highly_variable'] = k_hvg >= min_n - else: - sc.pp.highly_variable_genes( - adata, - min_mean=mean_limits[0], - max_mean=mean_limits[1], - min_disp=disp_limits[0], - max_disp=disp_limits[1], - subset=subset, - **kwargs, - ) + + sc.pp.highly_variable_genes( + adata, + min_mean=mean_limits[0], + max_mean=mean_limits[1], + min_disp=disp_limits[0], + max_disp=disp_limits[1], + **kwargs, + ) + return adata From 8dde17005afdff5ad393e009ccc26f9dc522ef7c Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Fri, 14 Aug 2020 17:15:31 +0100 Subject: [PATCH 24/29] Add new rank_genes_groups options --- scanpy_scripts/cmd_options.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index f0dbf7e6..e111daba 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1180,6 +1180,21 @@ help='Rank genes by the absolute value of the score, not by the score. ' 'The returned scores are never the absolute values.', ), + click.option( + '--pts', + is_flag=True, + default=False, + show_default=True, + help='Compute the fraction of cells expressing the genes.' + ), + click.option( + '--tie-correct', + is_flag=True, + default=False, + show_default=True, + help="Use tie correction for 'wilcoxon' scores. Used only for " + "'wilcoxon'." + ), click.option( '--filter-params', type=Dictionary(keys=[ From 494ff24f67975d138e58413ab2522fca74283745 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 18 Aug 2020 11:15:07 +0100 Subject: [PATCH 25/29] Bump Scanpy --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3760c759..de3a2f82 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ 'matplotlib', 'pandas', 'h5py', - 'scanpy>=1.5.1', + 'scanpy>=1.6.0', 'louvain', 'leidenalg', 'loompy>=2.0.0,<3.0.0', From d50cb627b471d6b1936f09e3ccdb6e71b6cb046d Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 18 Aug 2020 11:41:36 +0100 Subject: [PATCH 26/29] Whoops- forgot to add the combat file --- scanpy_scripts/lib/_combat.py | 51 +++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 scanpy_scripts/lib/_combat.py diff --git a/scanpy_scripts/lib/_combat.py b/scanpy_scripts/lib/_combat.py new file mode 100644 index 00000000..7ef65bea --- /dev/null +++ b/scanpy_scripts/lib/_combat.py @@ -0,0 +1,51 @@ +""" +scanpy combat +""" + +import scanpy as sc + +# Wrapper for mnn allowing use of non-standard slot + +def combat(adata, key=None, key_added=None, layer=None, **kwargs): + """ + Wrapper function for scanpy.pp.combat(), for supporting non-standard slots + """ + + # If layer is set then we have to move the contents of that layer into + # .X for analysis. We back up the original .X, but only if the user hasn't + # specified to overwrite it anyway. + + if layer: + if key_added and key_added != 'X': + adata.layers['X_backup'] = adata.X + + adata.X = adata.layers[layer] + + # If we're storing results in .X (whether from .X or from a layer), run in + # place to save copying objects. + + if key_added and key_added == 'X': + sc.pp.combat(adata, key = key, **kwargs) + + # If we're storing in 'layers' (key_added is not set, or is not X, then + # don't run in place, and put the matrix in the specified layer. + + else: + + cdata = sc.pp.combat(adata, key=key, inplace = False, **kwargs) + + combat_key = 'combat' + if layer: + combat_key = f"{combat_key}_{layer}" + + # If we ran from a layer, restore the .X we had to overwrite + + adata.X = adata.layers['X_backup'] + del adata.layers['X_backup'] + + if key_added: + combat_key = f"{combat_key}_{key_added}" + + adata.layers[combat_key] = cdata + + return adata From c41469325e14f9c5770417c282b7d7001780c761 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Tue, 18 Aug 2020 13:48:00 +0100 Subject: [PATCH 27/29] Add help text with integration methods --- README.md | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 8f7fb640..8558a004 100644 --- a/README.md +++ b/README.md @@ -37,18 +37,19 @@ Options: --help Show this message and exit. Commands: - read Read 10x data and save in specified format. - filter Filter data based on specified conditions. - norm Normalise data per cell. - hvg Find highly variable genes. - scale Scale data per gene. - regress Regress-out observation variables. - pca Dimensionality reduction by PCA. - neighbor Compute a neighbourhood graph of observations. - embed Embed cells into two-dimensional space. - cluster Cluster cells into sub-populations. - diffexp Find markers for each clusters. - paga Trajectory inference by abstract graph analysis. - dpt Calculate diffusion pseudotime relative to the root cells. - plot Visualise data. + read Read 10x data and save in specified format. + filter Filter data based on specified conditions. + norm Normalise data per cell. + hvg Find highly variable genes. + scale Scale data per gene. + regress Regress-out observation variables. + pca Dimensionality reduction by PCA. + neighbor Compute a neighbourhood graph of observations. + embed Embed cells into two-dimensional space. + cluster Cluster cells into sub-populations. + diffexp Find markers for each clusters. + paga Trajectory inference by abstract graph analysis. + dpt Calculate diffusion pseudotime relative to the root cells. + integrate Integrate cells from different experimental batches. + plot Visualise data. ``` From 987d65505041ca0e8d086294b483b1cb50630808 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Sun, 23 Aug 2020 21:46:26 +0100 Subject: [PATCH 28/29] Simplify commands --- scanpy-scripts-tests.bats | 4 ++-- scanpy_scripts/cmds.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 28ce02aa..4c846ce0 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -462,7 +462,7 @@ setup() { skip "$harmony_integrate_obj exists and resume is set to 'true'" fi - run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony_integrate $harmony_integrate_opt $louvain_obj $harmony_integrate_obj" + run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony $harmony_integrate_opt $louvain_obj $harmony_integrate_obj" [ "$status" -eq 0 ] [ -f "$plt_rank_genes_groups_matrix_pdf" ] @@ -515,7 +515,7 @@ setup() { skip "$mnn_obj exists and resume is set to 'true'" fi - run rm -f $mnn_obj && eval "$scanpy integrate mnn_correct $mnn_opt $louvain_obj $mnn_obj" + run rm -f $mnn_obj && eval "$scanpy integrate mnn $mnn_opt $louvain_obj $mnn_obj" [ "$status" -eq 0 ] [ -f "$mnn_obj" ] diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index 5b9c5470..b955f68b 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -223,7 +223,7 @@ ) HARMONY_INTEGRATE_CMD = make_subcmd( - 'harmony_integrate', + 'harmony', sce.pp.harmony_integrate, cmd_desc='Use harmonypy [Korunsky19] to integrate different experiments.', arg_desc=_IO_DESC, @@ -237,7 +237,7 @@ ) MNN_CORRECT_CMD = make_subcmd( - 'mnn_correct', + 'mnn', mnn_correct, cmd_desc='Correct batch effects by matching mutual nearest neighbors [Haghverdi18] [Kang18].', arg_desc=_IO_DESC, From dc613d9df567e4613a6b1b820e29c85c823fb0d3 Mon Sep 17 00:00:00 2001 From: Jonathan Manning Date: Sun, 23 Aug 2020 21:50:15 +0100 Subject: [PATCH 29/29] Finish: simplify commands --- scanpy_scripts/cmd_options.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index df68b6da..03e16d6b 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -1342,7 +1342,7 @@ ], - 'harmony_integrate': [ + 'harmony': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], COMMON_OPTIONS['batch_key'], @@ -1447,7 +1447,7 @@ COMMON_OPTIONS['random_state'], ], - 'mnn_correct': [ + 'mnn': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['output'], COMMON_OPTIONS['batch_key'], @@ -1893,4 +1893,4 @@ COMMON_OPTIONS['swap_axes'], ], -} \ No newline at end of file +}