diff --git a/README.md b/README.md index 8f7fb640..8558a004 100644 --- a/README.md +++ b/README.md @@ -37,18 +37,19 @@ Options: --help Show this message and exit. Commands: - read Read 10x data and save in specified format. - filter Filter data based on specified conditions. - norm Normalise data per cell. - hvg Find highly variable genes. - scale Scale data per gene. - regress Regress-out observation variables. - pca Dimensionality reduction by PCA. - neighbor Compute a neighbourhood graph of observations. - embed Embed cells into two-dimensional space. - cluster Cluster cells into sub-populations. - diffexp Find markers for each clusters. - paga Trajectory inference by abstract graph analysis. - dpt Calculate diffusion pseudotime relative to the root cells. - plot Visualise data. + read Read 10x data and save in specified format. + filter Filter data based on specified conditions. + norm Normalise data per cell. + hvg Find highly variable genes. + scale Scale data per gene. + regress Regress-out observation variables. + pca Dimensionality reduction by PCA. + neighbor Compute a neighbourhood graph of observations. + embed Embed cells into two-dimensional space. + cluster Cluster cells into sub-populations. + diffexp Find markers for each clusters. + paga Trajectory inference by abstract graph analysis. + dpt Calculate diffusion pseudotime relative to the root cells. + integrate Integrate cells from different experimental batches. + plot Visualise data. ``` diff --git a/scanpy-scripts-tests.bats b/scanpy-scripts-tests.bats index 4022bedd..4c846ce0 100755 --- a/scanpy-scripts-tests.bats +++ b/scanpy-scripts-tests.bats @@ -69,6 +69,19 @@ setup() { plt_rank_genes_groups_matrix_pdf="${output_dir}/rggmatrix_${test_clustering}.pdf" plt_rank_genes_groups_dot_pdf="${output_dir}/rggdot_${test_clustering}.pdf" plt_rank_genes_groups_heatmap_pdf="${output_dir}/rggheatmap_${test_clustering}.pdf" + harmony_integrate_obj="${output_dir}/harmony_integrate.h5ad" + harmony_integrate_opt="--batch-key ${test_clustering}" + harmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings after harmony' --basis 'X_pca_harmony'" + noharmony_plt_embed_opt="--projection 2d --color ${test_clustering} --title 'PCA embeddings before harmony' --basis 'X_pca'" + harmony_integrated_pca_pdf="${output_dir}/harmony_pca_${test_clustering}.pdf" + noharmony_integrated_pca_pdf="${output_dir}/pca_${test_clustering}.pdf" + bbknn_obj="${output_dir}/bbknn.h5ad" + bbknn_opt="--batch-key ${test_clustering} --key-added bbknn" + mnn_obj="${output_dir}/mnn.h5ad" + mnn_opt="--batch-key ${test_clustering}" + combat_obj="${output_dir}/combat.h5ad" + combat_opt="--batch-key ${test_clustering}" + if [ ! -d "$data_dir" ]; then mkdir -p $data_dir @@ -442,6 +455,85 @@ setup() { [ -f "$plt_rank_genes_groups_matrix_pdf" ] } +# Do harmony batch correction, using clustering as batch (just for test purposes) + +@test "Run Harmony batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$harmony_integrate_obj" ]; then + skip "$harmony_integrate_obj exists and resume is set to 'true'" + fi + + run rm -f $harmony_integrate_obj && eval "$scanpy integrate harmony $harmony_integrate_opt $louvain_obj $harmony_integrate_obj" + + [ "$status" -eq 0 ] + [ -f "$plt_rank_genes_groups_matrix_pdf" ] + +} + +# Run Plot PCA embedding before harmony + +@test "Run Plot PCA embedding before Harmony" { + if [ "$resume" = 'true' ] && [ -f "$noharmony_integrated_pca_pdf" ]; then + skip "$noharmony_integrated_pca_pdf exists and resume is set to 'true'" + fi + + run rm -f $noharmony_integrated_pca_pdf && eval "$scanpy plot embed $noharmony_plt_embed_opt $louvain_obj $noharmony_integrated_pca_pdf" + + [ "$status" -eq 0 ] + [ -f "$noharmony_integrated_pca_pdf" ] +} + +# Run Plot PCA embedding after harmony + +@test "Run Plot PCA embedding after Harmony" { + if [ "$resume" = 'true' ] && [ -f "$harmony_integrated_pca_pdf" ]; then + skip "$harmony_integrated_pca_pdf exists and resume is set to 'true'" + fi + + run rm -f $harmony_integrated_pca_pdf && eval "$scanpy plot embed $harmony_plt_embed_opt $harmony_integrate_obj $harmony_integrated_pca_pdf" + + [ "$status" -eq 0 ] + [ -f "$harmony_integrated_pca_pdf" ] +} + +# Do bbknn batch correction, using clustering as batch (just for test purposes) + +@test "Run BBKNN batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$bbknn_obj" ]; then + skip "$bbknn_obj exists and resume is set to 'true'" + fi + + run rm -f $bbknn_obj && eval "$scanpy integrate bbknn $bbknn_opt $louvain_obj $bbknn_obj" + + [ "$status" -eq 0 ] + [ -f "$plt_rank_genes_groups_matrix_pdf" ] +} + +# Do MNN batch correction, using clustering as batch (just for test purposes) + +@test "Run MNN batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$mnn_obj" ]; then + skip "$mnn_obj exists and resume is set to 'true'" + fi + + run rm -f $mnn_obj && eval "$scanpy integrate mnn $mnn_opt $louvain_obj $mnn_obj" + + [ "$status" -eq 0 ] + [ -f "$mnn_obj" ] +} + +# Do ComBat batch correction, using clustering as batch (just for test purposes) + +@test "Run Combat batch integration using clustering as batch" { + if [ "$resume" = 'true' ] && [ -f "$combat_obj" ]; then + skip "$combat_obj exists and resume is set to 'true'" + fi + + run rm -f $combat_obj && eval "$scanpy integrate combat $combat_opt $louvain_obj $combat_obj" + + [ "$status" -eq 0 ] + [ -f "$combat_obj" ] +} + # Local Variables: # mode: sh # End: diff --git a/scanpy_scripts/cli.py b/scanpy_scripts/cli.py index dc7a07d7..b5ed0bfc 100755 --- a/scanpy_scripts/cli.py +++ b/scanpy_scripts/cli.py @@ -30,6 +30,10 @@ PLOT_DOT_CMD, PLOT_MATRIX_CMD, PLOT_HEATMAP_CMD, + HARMONY_INTEGRATE_CMD, + BBKNN_CMD, + MNN_CORRECT_CMD, + COMBAT_CMD, ) @@ -101,6 +105,16 @@ def cluster(): cli.add_command(DPT_CMD) +@cli.group(cls=NaturalOrderGroup) +def integrate(): + """Integrate cells from different experimental batches.""" + +integrate.add_command(HARMONY_INTEGRATE_CMD) +integrate.add_command(BBKNN_CMD) +integrate.add_command(MNN_CORRECT_CMD) +integrate.add_command(COMBAT_CMD) + + @cli.group(cls=NaturalOrderGroup) def plot(): """Visualise data.""" diff --git a/scanpy_scripts/cmd_options.py b/scanpy_scripts/cmd_options.py index 36e8276e..03e16d6b 100644 --- a/scanpy_scripts/cmd_options.py +++ b/scanpy_scripts/cmd_options.py @@ -503,7 +503,23 @@ help='Key for categorical in `.obs`. You can pass your predefined ' 'groups by choosing any categorical annotation of observations.', ), - ] + ], + + 'batch_key': click.option( + '--batch-key', 'key', + type=click.STRING, + required=True, + help='The name of the column in adata.obs that differentiates among ' + 'experiments/batches.' + ), + + 'batch_layer': click.option( + '--layer', '-l', + type=click.STRING, + default=None, + show_default=True, + help="Layer to batch correct. By default corrects the contents of .X." + ), } CMD_OPTIONS = { @@ -697,6 +713,14 @@ help='Cutoffs for the dispersion of expression' 'in the format of "-d min max".', ), + click.option( + '--span', + type=click.FLOAT, + default=0.3, + show_default=True, + help="The fraction of the data (cells) used when estimating the " + "variance in the loess model fit if flavor='seurat_v3'." + ), click.option( '--n-bins', '-b', type=click.INT, @@ -713,7 +737,7 @@ ), click.option( '--flavor', '-v', - type=click.Choice(['seurat', 'cellranger']), + type=click.Choice(['seurat', 'cellranger', 'seurat_v3']), default='seurat', show_default=True, help='Choose the flavor for computing normalized dispersion.', @@ -726,12 +750,16 @@ 'only flag highly-variable genes.', ), click.option( - '--by-batch', '-B', - type=(click.STRING, click.INT), - default=(None, None), - show_default=True, - help='Find highly variable genes within each batch defined by ' - 'then pool and keep those found in at least batches.', + '--batch-key', 'batch_key', + type=click.STRING, + default=None, + help="If specified, highly-variable genes are selected within each " + "batch separately and merged. This simple process avoids the selection of " + "batch-specific genes and acts as a lightweight batch correction method. For all " + "flavors, genes are first sorted by how many batches they are a HVG. For " + "dispersion-based flavors ties are broken by normalized dispersion. If flavor = " + "'seurat_v3', ties are broken by the median (across batches) rank based on " + "within-batch normalized variance." ), ], @@ -1152,6 +1180,21 @@ help='Rank genes by the absolute value of the score, not by the score. ' 'The returned scores are never the absolute values.', ), + click.option( + '--pts', + is_flag=True, + default=False, + show_default=True, + help='Compute the fraction of cells expressing the genes.' + ), + click.option( + '--tie-correct', + is_flag=True, + default=False, + show_default=True, + help="Use tie correction for 'wilcoxon' scores. Used only for " + "'wilcoxon'." + ), click.option( '--filter-params', type=Dictionary(keys=[ @@ -1269,6 +1312,321 @@ ), ], + 'combat': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + COMMON_OPTIONS['batch_key'], + COMMON_OPTIONS['batch_layer'], + click.option( + '--key-added', + type=click.STRING, + default=None, + show_default=True, + help="Key under which to add the computed results. By default a new " + "layer will be created called 'combat', 'combat_{layer}' or " + "'combat_layer_{key_added}' where those parameters were specified. A value of 'X' " + "causes batch-corrected values to overwrite the original content of .X." + ), + click.option( + '--covariates', + type=(CommaSeparatedText()), + default=None, + show_default=True, + help="Comma-separated list of additional covariates besides the " + "batch variable such as adjustment variables or biological condition. This " + "parameter refers to the design matrix X in Equation 2.1 in [Johnson07] and to " + "the mod argument in the original combat function in the sva R package. Note " + "that not including covariates may introduce bias or lead to the removal of " + "biological signal in unbalanced designs." + ), + + ], + + 'harmony': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + COMMON_OPTIONS['batch_key'], + click.option( + '--basis', + type=click.STRING, + default='X_pca', + show_default=True, + help="The name of the field in adata.obsm where the PCA table is " + "stored. Defaults to 'X_pca', which is the default for sc.tl.pca()." + ), + click.option( + '--adjusted-basis', + type=click.STRING, + default='X_pca_harmony', + show_default=True, + help='The name of the field in adata.obsm where the adjusted PCA ' + 'table will be stored after running this function.' + ), + click.option( + '--theta', + type=click.FLOAT, + default=2, + show_default=True, + help='Diversity clustering penalty parameter. theta=0 does not encourage any ' + 'diversity. Larger values of theta result in more diverse clusters.' + ), + click.option( + '--lambda', 'lamb', + type=click.FLOAT, + default=1, + show_default=True, + help='Ridge regression penalty parameter. Lambda must be strictly ' + 'positive. Smaller values result in more aggressive correction.' + ), + click.option( + '--sigma', + type=click.FLOAT, + default=0.1, + show_default=True, + help='Width of soft kmeans clusters. Sigma scales the distance from ' + 'a cell to cluster centroids. Larger values of sigma result in cells assigned to ' + 'more clusters. Smaller values of sigma make soft kmeans cluster approach hard ' + 'clustering.' + ), + click.option( + '--n-clust', 'nclust', + type=click.INT, + default=None, + show_default=False, + help='Number of clusters in model. nclust=1 equivalent to simple ' + 'linear regression.' + ), + click.option( + '--tau', + type=click.INT, + default=0, + show_default=True, + help='Protection against overclustering small datasets with large ones. ' + 'tau is the expected number of cells per cluster.' + ), + click.option( + '--block-size', + type=click.FLOAT, + default=0.05, + show_default=True, + help='What proportion of cells to update during clustering. Between ' + '0 to 1, default 0.05. Larger values may be faster but less accurate.' + ), + click.option( + '--max-iter-cluster', 'max_iter_kmeans', + type=click.INT, + default=20, + show_default=True, + help='Maximum number of rounds to run clustering at each round of ' + 'Harmony.' + ), + click.option( + '--max-iter-harmony', + type=click.INT, + default=10, + show_default=True, + help='Maximum number of rounds to run Harmony. One round of Harmony ' + 'involves one clustering and one correction step.' + ), + click.option( + '--epsilon-cluster', + type=click.FLOAT, + default=1e-5, + show_default=True, + help='Convergence tolerance for clustering round of Harmony Set to ' + '-Inf to never stop early.' + ), + click.option( + '--epsilon-harmony', + type=click.FLOAT, + default=1e-5, + show_default=True, + help='Convergence tolerance for clustering round of Harmony Set to ' + '-Inf to never stop early.' + ), + COMMON_OPTIONS['random_state'], + ], + + 'mnn': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + COMMON_OPTIONS['batch_key'], + COMMON_OPTIONS['batch_layer'], + click.option( + '--key-added', + type=click.STRING, + default=None, + show_default=True, + help="Key under which to add the computed results. By default a new " + "layer will be created called 'mnn', 'mnn_{layer}' or " + "'mnn_layer_{key_added}' where those parameters were specified. A value of 'X' " + "causes batch-corrected values to overwrite the original content of .X." + ), + click.option( + '--var-subset', + type=(click.STRING, CommaSeparatedText()), + multiple=True, + help="The subset of vars (list of str) to be used when performing " + "MNN correction in the format of '--var-subset '. Typically, use " + "the highly variable genes (HVGs) like '--var-subset highly_variable True'. When " + "unset, uses all vars." + ), + click.option( + '--n-neighbors', '-k', + type=CommaSeparatedText(click.INT, simplify=True), + default=20, + show_default=True, + help='Number of mutual nearest neighbors.' + ), + click.option( + '--sigma', + type=click.FLOAT, + default=1.0, + show_default=True, + help='The bandwidth of the Gaussian smoothing kernel used to ' + 'compute the correction vectors.' + ), + click.option( + '--no-cos_norm_in', 'cos_norm_in', + is_flag=True, + default=True, + help='Default behaviour is to perform cosine normalization on the ' + 'input data prior to calculating distances between cells. Use this ' + 'flag to disable that behaviour.' + ), + click.option( + '--no-cos_norm_out', 'cos_norm_out', + is_flag=True, + default=True, + help='Default behaviour is to perform cosine normalization prior to ' + 'computing corrected expression values. Use this flag to disable that ' + 'behaviour.' + ), + click.option( + '--svd-dim', + type=click.INT, + default=None, + show_default=True, + help='The number of dimensions to use for summarizing biological ' + 'substructure within each batch. If not set, biological components ' + 'will not be removed from the correction vectors.' + ), + click.option( + '--no-var-adj', + is_flag=True, + default=True, + help='Default behaviour is to adjust variance of the correction ' + 'vectors. Use this flag to disable that behaviour. Note this step takes most ' + 'computing time.' + ), + click.option( + '--compute-angle', + is_flag=True, + default=False, + help='When set, compute the angle between each cell’s correction ' + 'vector and the biological subspace of the reference batch.' + ), + click.option( + '--svd-mode', + type=click.Choice(['svd', 'rsvd', 'irlb']), + default='rsvd', + show_default=True, + help="'svd' computes SVD using a non-randomized SVD-via-ID " + "algorithm, while 'rsvd' uses a randomized version. 'irlb' performs truncated " + "SVD by implicitly restarted Lanczos bidiagonalization (forked from " + "https://github.com/airysen/irlbpy)." + ), + ], + + 'bbknn': [ + *COMMON_OPTIONS['input'], + *COMMON_OPTIONS['output'], + COMMON_OPTIONS['key_added'], + COMMON_OPTIONS['batch_key'], + click.option( + '--use-rep', '-u', + type=click.STRING, + default='X_pca', + show_default=True, + help='The dimensionality reduction in .obsm to use for neighbour ' + 'detection.' + ), + COMMON_OPTIONS['use_pc'][0], # --n-pcs + click.option( + '--no-approx', 'approx', + is_flag=True, + default=True, + help='Default behaviour is to use annoy’s approximate neighbour ' + 'finding. This results in a quicker run time for large datasets while also ' + 'potentially increasing the degree of batch correction. Use this flag to disable ' + 'that behaviour.', + ), + click.option( + '--metric', '-t', + type=click.Choice(['angular', 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']), + default='angular', + show_default=True, + help='A known metric’s name.' + ), + click.option( + '--neighbors-within-batch', + type=click.INT, + default=3, + show_default=True, + help='How many top neighbours to report for each batch; total ' + 'number of neighbours will be this number times the number of batches.' + ), + click.option( + '--trim', + type=click.INT, + default=None, + show_default=True, + help='Trim the neighbours of each cell to these many top ' + 'connectivities. May help with population independence and improve the tidiness ' + 'of clustering. The lower the value the more independent the individual ' + 'populations, at the cost of more conserved batch effect. If None, sets the ' + 'parameter value automatically to 10 times the total number of neighbours for ' + 'each cell. Set to 0 to skip.' + ), + click.option( + '--n-trees', + type=click.INT, + default=10, + show_default=True, + help='Only used when approx=True. The number of trees to construct ' + 'in the annoy forest. More trees give higher precision when querying, at the ' + 'cost of increased run time and resource intensity.' + ), + click.option( + '--no-use-faiss', 'use_faiss', + is_flag=True, + default=True, + help='Default behaviour If approx=False and the metric is ' + '“euclidean”, is to use the faiss package to compute nearest neighbours if ' + 'installed. This improves performance at a minor cost to numerical precision as ' + 'faiss operates on float32. Use this flag to disable that behaviour.' + ), + click.option( + '--set-op-mix-ratio', + type=click.FLOAT, + default=1, + show_default=True, + help='UMAP connectivity computation parameter, float between 0 and ' + '1, controlling the blend between a connectivity matrix formed exclusively from ' + 'mutual nearest neighbour pairs (0) and a union of all observed neighbour ' + 'relationships with the mutual pairs emphasised (1).' + ), + click.option( + '--local-connectivity', + type=click.INT, + default=1, + show_default=True, + help='UMAP connectivity computation parameter, how many nearest ' + 'neighbors of each cell are assumed to be fully connected (and given a ' + 'connectivity value of 1)' + ), + ], + 'embed': [ *COMMON_OPTIONS['input'], *COMMON_OPTIONS['plot'], diff --git a/scanpy_scripts/cmd_utils.py b/scanpy_scripts/cmd_utils.py index 8f7a5665..a2449e07 100644 --- a/scanpy_scripts/cmd_utils.py +++ b/scanpy_scripts/cmd_utils.py @@ -235,9 +235,11 @@ def plot_function( if func_name == 'scatter' or func_name == 'embedding': prefix = kwargs.get('basis', func.__name__) elif kind: - prefix = kind + prefix = kind elif func_name in plot_funcs: prefix = plot_funcs[ func_name ].__name__.split('.')[-1] + if func_name in [ 'sviol', 'rgg_sviol', 'dot', 'rgg_dot', 'matrix', 'rgg_matrix' ]: + prefix = prefix + '_' os.rename( os.path.join(sc.settings.figdir, prefix + figname), output_fig) diff --git a/scanpy_scripts/cmds.py b/scanpy_scripts/cmds.py index e697e69c..b955f68b 100644 --- a/scanpy_scripts/cmds.py +++ b/scanpy_scripts/cmds.py @@ -5,6 +5,7 @@ import os import sys import scanpy as sc +import scanpy.external as sce from .cmd_utils import ( make_subcmd, @@ -25,6 +26,9 @@ from .lib._paga import paga from .lib._diffmap import diffmap from .lib._dpt import dpt +from .lib._bbknn import bbknn +from .lib._mnn import mnn_correct +from .lib._combat import combat LANG = os.environ.get('LANG', None) @@ -210,3 +214,31 @@ arg_desc=_IP_DESC, opt_set='plot_paga' ) + +COMBAT_CMD = make_subcmd( + 'combat', + combat, + cmd_desc='ComBat function for batch effect correction', + arg_desc=_IO_DESC +) + +HARMONY_INTEGRATE_CMD = make_subcmd( + 'harmony', + sce.pp.harmony_integrate, + cmd_desc='Use harmonypy [Korunsky19] to integrate different experiments.', + arg_desc=_IO_DESC, +) + +BBKNN_CMD = make_subcmd( + 'bbknn', + bbknn, + cmd_desc='Batch balanced kNN [Polanski19].', + arg_desc=_IO_DESC, +) + +MNN_CORRECT_CMD = make_subcmd( + 'mnn', + mnn_correct, + cmd_desc='Correct batch effects by matching mutual nearest neighbors [Haghverdi18] [Kang18].', + arg_desc=_IO_DESC, +) diff --git a/scanpy_scripts/lib/_bbknn.py b/scanpy_scripts/lib/_bbknn.py new file mode 100644 index 00000000..a10cb3a0 --- /dev/null +++ b/scanpy_scripts/lib/_bbknn.py @@ -0,0 +1,28 @@ +""" +scanpy external bbknn +""" + +import scanpy.external as sce + +from ..obj_utils import ( + _backup_default_key, + _delete_backup_key, + _rename_default_key, +) + +# Wrapper for bbknn allowing use of non-standard slot + +def bbknn(adata, key=None, key_added=None, **kwargs): + """ + Wrapper function for sce.pp.bbknn(), for supporting non-standard neighbors slot + """ + + _backup_default_key(adata.uns, 'neighbors') + sce.pp.bbknn(adata, batch_key = key, **kwargs) + + if key_added: + _rename_default_key(adata.uns, 'neighbors', f'neighbors_{key_added}') + else: + _delete_backup_key(adata.uns, 'neighbors') + + return adata diff --git a/scanpy_scripts/lib/_combat.py b/scanpy_scripts/lib/_combat.py new file mode 100644 index 00000000..7ef65bea --- /dev/null +++ b/scanpy_scripts/lib/_combat.py @@ -0,0 +1,51 @@ +""" +scanpy combat +""" + +import scanpy as sc + +# Wrapper for mnn allowing use of non-standard slot + +def combat(adata, key=None, key_added=None, layer=None, **kwargs): + """ + Wrapper function for scanpy.pp.combat(), for supporting non-standard slots + """ + + # If layer is set then we have to move the contents of that layer into + # .X for analysis. We back up the original .X, but only if the user hasn't + # specified to overwrite it anyway. + + if layer: + if key_added and key_added != 'X': + adata.layers['X_backup'] = adata.X + + adata.X = adata.layers[layer] + + # If we're storing results in .X (whether from .X or from a layer), run in + # place to save copying objects. + + if key_added and key_added == 'X': + sc.pp.combat(adata, key = key, **kwargs) + + # If we're storing in 'layers' (key_added is not set, or is not X, then + # don't run in place, and put the matrix in the specified layer. + + else: + + cdata = sc.pp.combat(adata, key=key, inplace = False, **kwargs) + + combat_key = 'combat' + if layer: + combat_key = f"{combat_key}_{layer}" + + # If we ran from a layer, restore the .X we had to overwrite + + adata.X = adata.layers['X_backup'] + del adata.layers['X_backup'] + + if key_added: + combat_key = f"{combat_key}_{key_added}" + + adata.layers[combat_key] = cdata + + return adata diff --git a/scanpy_scripts/lib/_hvg.py b/scanpy_scripts/lib/_hvg.py index a4101729..3cd53b73 100644 --- a/scanpy_scripts/lib/_hvg.py +++ b/scanpy_scripts/lib/_hvg.py @@ -9,50 +9,24 @@ def hvg( adata, mean_limits=(0.0125, 3), disp_limits=(0.5, float('inf')), - subset=False, - by_batch=None, **kwargs, ): """ - Wrapper function for sc.highly_variable_genes(), mainly to support searching - by batch and pooling. + Wrapper function for sc.highly_variable_genes() """ # Check for n_top_genes beeing greater than the total genes if 'n_top_genes' in kwargs and kwargs['n_top_genes'] is not None: kwargs['n_top_genes'] = min(adata.n_vars, kwargs['n_top_genes']) - - if by_batch and isinstance(by_batch, (list, tuple)) and by_batch[0]: - batch_name = by_batch[0] - min_n = by_batch[1] - k_hvg = np.zeros(adata.shape[1], dtype=int) - for batch_value in adata.obs[batch_name].unique(): - k_v = adata.obs[batch_name] == batch_value - ad = adata[k_v, :] - k_f, _ = sc.pp.filter_genes(ad, min_cells=3, inplace=False) - hvg = sc.pp.highly_variable_genes( - ad[:, k_f], - min_mean=mean_limits[0], - max_mean=mean_limits[1], - min_disp=disp_limits[0], - max_disp=disp_limits[1], - inplace=False, - **kwargs, - )['highly_variable'] - k_hvg[k_f] += hvg - if subset: - adata = adata[:, k_hvg >= min_n] - else: - adata.var['highly_variable'] = k_hvg >= min_n - else: - sc.pp.highly_variable_genes( - adata, - min_mean=mean_limits[0], - max_mean=mean_limits[1], - min_disp=disp_limits[0], - max_disp=disp_limits[1], - subset=subset, - **kwargs, - ) + + sc.pp.highly_variable_genes( + adata, + min_mean=mean_limits[0], + max_mean=mean_limits[1], + min_disp=disp_limits[0], + max_disp=disp_limits[1], + **kwargs, + ) + return adata diff --git a/scanpy_scripts/lib/_mnn.py b/scanpy_scripts/lib/_mnn.py new file mode 100644 index 00000000..0a559a64 --- /dev/null +++ b/scanpy_scripts/lib/_mnn.py @@ -0,0 +1,85 @@ +""" +scanpy external mnn +""" + +import scanpy.external as sce +import numpy as np +import click + +# Wrapper for mnn allowing use of non-standard slot + +def mnn_correct(adata, key=None, key_added=None, var_subset=None, layer=None, **kwargs): + """ + Wrapper function for sce.pp.mnn_correct(), for supporting non-standard neighbors slot + """ + + # mnn will use .X, so we need to put other layers there for processing + + if layer: + adata.layers['X_backup'] = adata.X + adata.X = adata.layers[layer] + + # mnn_correct() wants batches in separate adatas + + batches = np.unique(adata.obs[key]) + alldata = [] + for batch in batches: + alldata.append( adata[adata.obs[key] == batch,] ) + + # Process var_subset into a list of strings that can be provided to + # mnn_correct() + + if var_subset is not None and len(var_subset) > 0 and var_subset[0] is not None: + + subset = [] + + for name, values in var_subset : + if name in adata.var: + if adata.var[name].dtype == 'bool': + values = [ True if x.lower() == "true" else x for x in values ] + else: + raise click.ClickException(f'Var "{name}" unavailable') + + ind = [ x in values for x in adata.var[name] ] + subset = subset + adata.var.index[ ind ].to_list() + + var_subset = set(subset) + print('Will use %d selected genes for MNN' % len(var_subset)) + + else: + var_subset = None + + # Here's the main bit + + cdata = sce.pp.mnn_correct(*alldata, var_subset = var_subset, do_concatenate = True, index_unique = None, **kwargs) + + # If user has specified key_added = X then they want us to overwrite .X, + # othwerwise copy the .X to a named layer of the original object. In either + # case make sure obs and var are the same as the original. + + if key_added is None or key_added != 'X': + + mnn_key = 'mnn' + if layer: + mnn_key = f"{mnn_key}_{layer}" + + # Layers is set (so we're not storing computed results in the .X, + # and we had to overwrite .X to run mnn), and key_added shows we're + # not storing in the .X, so we need to restore from the backup. + + adata.X = adata.layers['X_backup'] + + if key_added: + mnn_key = f"{mnn_key}_{key_added}" + + adata.layers[mnn_key] = cdata[0][adata.obs.index, adata.var.index].X + + else: + adata.X = cdata[0][adata.obs.index, adata.var.index].X + + # Delete the backup of .X if we needed one + + if layer: + del adata.layers['X_backup'] + + return adata diff --git a/setup.py b/setup.py index 55d620f2..cb4df062 100644 --- a/setup.py +++ b/setup.py @@ -41,12 +41,15 @@ 'matplotlib', 'pandas', 'h5py', - 'scanpy>=1.5.1,<1.6.0', + 'scanpy>=1.6.0', 'louvain', 'leidenalg', 'loompy>=2.0.0,<3.0.0', 'MulticoreTSNE', 'Click', - 'umap-learn<0.4.0' + 'umap-learn<0.4.0', + 'harmonypy>=0.0.5', + 'bbknn>=1.3.12', + 'mnnpy>=0.1.9.5' ], )