diff --git a/AltAnalyze.py b/AltAnalyze.py
index 9c94034..914aa3c 100755
--- a/AltAnalyze.py
+++ b/AltAnalyze.py
@@ -6274,9 +6274,11 @@ def commandLineRun():
     customFASTA = None
     filterFile = None
     PearsonThreshold = 0.1
-    returnCentroids = False
+    returnCentroids = 'community'
     runCompleteWorkflow=True
     k=None
+    labels=None
+
     original_arguments = sys.argv
     arguments=[]
     for arg in original_arguments:
@@ -6457,8 +6459,16 @@ def commandLineRun():
             else:
                 DE = False
         elif opt == '--referenceType':
-            if string.lower(arg) == 'centroid': returnCentroids = True; CenterMethod='centroid'
-            else: CenterMethod='median'; returnCentroids = True
+            if string.lower(arg) == 'centroid' or string.lower(arg) == 'mean':
+                returnCentroids = True; CenterMethod='centroid'
+            elif string.lower(arg) == 'medoid' or string.lower(arg) == 'median':
+                returnCentroids = True; CenterMethod='median'
+            elif string.lower(arg) == 'community' or string.lower(arg) == 'louvain':
+                returnCentroids = 'community'; CenterMethod='community'
+            elif string.lower(arg) == 'cells' or string.lower(arg) == 'cell':
+                returnCentroids = False; CenterMethod='centroid'
+            else:
+                returnCentroids = 'community'; CenterMethod='community'
         elif opt == '--multiThreading' or opt == '--multiProcessing':
             multiThreading=arg
             if multiThreading == 'yes': multiThreading = True
@@ -7000,6 +7010,7 @@ def commandLineRun():
                 separateGenePlots = True
             else:
                 separateGenePlots = False
+
         if opt == '--zscore':
             if arg=='yes' or arg=='True' or arg == 'true':
                 zscore=True
@@ -8036,6 +8047,7 @@ def commandLineRun():
            elif opt == '--adjp': adjp = arg
            elif opt == '--performDiffExp': performDiffExp = arg
            elif opt == '--centerMethod': CenterMethod = arg
+           elif opt == '--labels': labels = arg
            fl = UI.ExpressionFileLocationData('','','','')
            fl.setSpecies(species)
            fl.setVendor(manufacturer)
@@ -8049,6 +8061,7 @@ def commandLineRun():
                fl.setUseAdjPvalue(adjp)
                fl.setPvalThreshold(pval)
                fl.setFoldCutoff(FoldDiff)
+               fl.setLabels(labels)
            else:
                fl.setClassificationAnalysis('LineageProfiler')
            #fl.setCompendiumType('AltExon')
@@ -8063,9 +8076,8 @@ def commandLineRun():
                        ICGS_files.append(input_file)
                import LineageProfilerIterate
                print 'center method =',CenterMethod
-               try: LineageProfilerIterate.createMetaICGSResults(ICGS_files,output_dir,CenterMethod =CenterMethod,species=species,PearsonThreshold=PearsonThreshold)
-               except:
-                   LineageProfilerIterate.createMetaICGSResults(ICGS_files,output_dir,CenterMethod=CenterMethod)
+               LineageProfilerIterate.createMetaICGSResults(ICGS_files,output_dir,CenterMethod =CenterMethod,species=species,PearsonThreshold=PearsonThreshold)
+               #except: LineageProfilerIterate.createMetaICGSResults(ICGS_files,output_dir,CenterMethod=CenterMethod)
                sys.exit()
            try: CenterMethod=CenterMethod
            except: CenterMethod='centroid'
diff --git a/Config/default-files.csv b/Config/default-files.csv
index 2f070a1..4093285 100755
--- a/Config/default-files.csv
+++ b/Config/default-files.csv
@@ -3,11 +3,11 @@
 "exon_seq","","HuEx-1_0-st-v2.hg16.probeset.fa","Hs"
 "exon_seq","","MoEx-1_0-st-v1.mm5.probeset.fa","Mm"
 "exon_seq","","RaEx-1_0-st-v1.rn3.probeset.fa","Rn"
-"PathDir","local","/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/cellHarmony-evaluation/Grimes","all"
+"PathDir","local","/Users/saljh8/Downloads","all"
 "temp","temp","ftp://ftp.geneontology.org/pub/go/ontology-archive/function.ontology.2008-08-01.gz","all"
 "Program/Download","Status","Location","Species"
 "url","url","http://altanalyze.org/archiveDBs/","all"
-"PathFile","local","/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/cellHarmony-evaluation/Grimes","all" +"PathFile","local","/Volumes/salomonis2/HCA-Immune-10x-data/Bone-Marrow/MantonBM5/cellHarmony/heatmaps","all" "TrEMBL","ftp","ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_trembl_human.dat.gz","Hs" "TrEMBL","ftp","ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/uniprot_trembl_rodents.dat.gz","Mm|Rn" "APT","local","AltDatabase/affymetrix/APT","all" diff --git a/Config/options.txt b/Config/options.txt index e9227b8..6c94a3c 100755 --- a/Config/options.txt +++ b/Config/options.txt @@ -1 +1,186 @@ -Option Mac-Displayed option Display object Group Notes Descriptions Global defaults exon AltMouse gene 3'array junction RNASeq 10XGenomics dbase_version Select database version comboBox ArrayType manufacturer_selection Select vendor/data type comboBox ArrayType species Select species comboBox ArrayType array_type Select platform comboBox ArrayType --- --- --- --- --- --- --- update_dbs """Get new species/vendor/array databases online """ single-checkbox ArrayType --- --- --- --- --- --- --- run_from_scratch """Analysis options """ button AnalysisType Process CEL files|Process Expression file|Process AltAnalyze filtered|Annotate External Results|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Process AltAnalyze filtered|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Process AltAnalyze filtered|Annotate External Results|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Process AltAnalyze filtered|Interactive Result Viewer|Additional Analyses Process RNA-seq reads|Process Expression file|Process AltAnalyze filtered|Interactive Result Viewer|Additional Analyses Process Chromium Matrix|Process Expression file|Interactive Result Viewer|Additional Analyses selected_version """Select database version """ comboBox OnlineDatabases --- --- --- --- --- --- --- selected_species1 """Select species """ comboBox OnlineDatabases --- --- --- --- --- --- --- selected_species2 """Select species """ comboBox OnlineDatabases --- --- --- --- --- --- --- selected_species3 """Select species """ comboBox OnlineDatabases --- --- --- --- --- --- --- update_goelite_resources Download/update all gene-set analysis databases single-checkbox OnlineDatabases yes|no yes|no yes|no yes|no yes|no yes|no yes|no additional_analyses """Additional options """ button Additional Analyses Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality 
Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files inputIDs (optional) Analyze these gene symbols enter InputGOEliteDirs --- --- --- --- --- --- --- criterion_input_folder (optional) Select GO-Elite Input file directory folder InputGOEliteDirs --- --- --- --- --- --- --- criterion_denom_folder (optional) Select GO-Elite Denominator file directory folder InputGOEliteDirs --- --- --- --- --- --- --- main_output_folder Select GO-Elite output directory folder InputGOEliteDirs "note: by default, the output will be stored in a set of new directories under\kthe same directory as the input ID folder." --- --- --- --- --- --- --- new_run Additional Options button AdditionalOptions Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run dataset_name Give a name to this dataset enter InputCELFiles --- --- --- --- --- --- --- input_cel_dir Select the CEL file containing folder folder InputCELFiles --- --- --- --- --- --- --- input_fastq_dir (optional) Select fastq files to run in Kallisto folder InputCELFiles NA NA NA NA NA --- NA output_CEL_dir Select an AltAnalyze result output directory folder InputCELFiles --- --- --- --- --- --- --- multithreading Use multithreading for read genomic annotation comboBox InputCELFiles yes NA NA NA NA NA yes|no NA build_exon_bedfile Build exon coordinate bed file to obtain BAM file exon counts\k(see the online tutorial for additional details and information) single-checkbox InputCELFiles NA NA NA NA NA --- NA channel_to_extract Extract data from the following channels comboBox InputCELFiles NA NA NA green|red|green/red ratio|red/green ratio NA NA NA remove_xhyb Remove probesets that have large cross-hybridization scores single-checkbox InputCELFiles --- --- NA NA NA NA NA input_cdf_file Select the PGF library file for your array (required) file InputLibraryFiles note: the PGF file is apart of the standard library files for this array. This\kdirectory needs to also contain the CLF and BGP files for the array. These\kfiles can be downloaded from the Affymetrix website. --- --- --- --- --- --- --- input_annotation_file Select the CSV NetAffx annotation file for your array (recommended) file InputLibraryFiles "note: the CSV annotation file should be listed under ""Current NetAffx\kannotations"" for this array." --- --- --- --- --- --- --- input_exp_file Select a probe set expression file (required) file InputExpFiles --- --- --- --- --- --- --- input_stats_file Select a probe set p-value file (optional) file InputExpFiles "note: if not selected, an appropriate statistic file in the selected probe set\kexpression file directory will be used." 
--- --- --- NA --- NA NA output_dir Select an AltAnalyze result output directory folder InputExpFiles "note: by default, the output will be stored in a set of new directories under\kthe same directory as the input expression file.\k\kWhen analyzing already processed AltAnalyze outputs, select the file with\kthe prefix ""exp."" in the folder ""ExpressionInput""." --- --- --- --- --- --- --- dabg_p Remove probesets with a DABG p-value above enter GeneExpression Maximum average DABG p-value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. --- --- --- NA --- NA NA rpkm_threshold Remove genes with an RPKM less than enter GeneExpression NA NA NA NA NA --- --- gene_exp_threshold Remove genes expressed below (non-log) enter GeneExpression Maximum average non-log expression value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. NA NA NA NA NA --- NA exon_exp_threshold Remove exons expressed below (non-log) enter GeneExpression Maximum average non-log expression value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. NA NA NA NA NA --- NA exon_rpkm_threshold Remove exons with an RPKM less than enter GeneExpression NA NA NA NA NA --- NA expression_threshold Remove probesets expressed below (non-log) enter GeneExpression Maximum average non-log expression value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. --- --- --- NA --- --- NA perform_alt_analysis Perform alternative exon analysis comboBox GeneExpression Indicates whether to just export the gene expression summary or in addition perform an alternative exon analysis. yes|just expression yes|just expression yes|just expression NA yes|just expression yes|just expression NA analyze_as_groups Organize samples into groups comboBox GeneExpression Indicates whether AltAnalyze should peform group comparisons or just export interim results and skip filtering steps. Will also use metaprobesets as opposed to agglomerating information from filtered probesets for gene expression calculation. yes|only export sample data NA yes|only export sample data NA yes|only export sample data yes|only export sample data NA expression_data_format Expression data format comboBox GeneExpression Format the user data is in (typically log2 expression values). log|non-log log|non-log log|non-log log|non-log log|non-log log|non-log log|non-log normalize_feature_exp Normalize exon/junction expression comboBox GeneExpression NA NA NA NA NA RPKM|none NA normalize_gene_data Normalize feature expression comboBox GeneExpression NA NA NA quantile|group|None NA NA quantile|group|None avg_all_for_ss Determine gene expression levels using comboBox GeneExpression "Indicates whether to average the expression all probesets, as opposed to constitutive only, to calculate a gene expression value for an associated gene." constitutive probesets|core probesets NA constitutive probesets|core probesets NA constitutive probesets|known exons constitutive exons|expressed exons|known exons|known junctions NA include_raw_data Include replicate experiment values in export comboBox GeneExpression "Whether or not to include replicate data in the ExpressionBuilder gene expression export file, as opposed to summary statistics (average, t-test p, folds, etc.)." 
yes|no yes|no yes|no yes|no yes|no yes|no yes|no probability_algorithm Comparison group test statistic comboBox GeneExpression moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums FDR_statistic False discovery rate statistic comboBox GeneExpression Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value batch_effects Remove batch effects (Combat) comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no marker_finder Predict condition-specific biomarkers comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no visualize_results Perform expression clustering and visual QC comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no run_lineage_profiler Perform cell profiling with LineageProfiler comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no run_goelite Analyze ontologies and pathways with GO-Elite comboBox GeneExpression run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later input_filtered_dir Select a prior generated AltAnalyze results project folder folder InputFilteredFiles note: all files in this directory should be built by AltAnalyze from a prior analysis. "note: if not selected, all filtered probe set expression files in\k'AltExpression//'\kwill be used and results written to 'AltResults/AlternativeOutput'.\k" --- --- --- NA --- --- NA input_external_dir Select a directory with lists of alternative probesets and scores. folder InputExternalFiles "note: all files in this directory should contain probe set IDs and scores\k\k(e.g., FIRMA and MADS results). Files should have at least 3 columns\k(1) probe set ID, (2) normalized fold change and (3) splicing p-value." --- --- --- NA --- --- NA analysis_method Select the alternative exon algorithm comboBox AltAnalyze Alternative exon analysis method to apply to user data. splicing-index|FIRMA ASPIRE|linearregres splicing-index|FIRMA NA ASPIRE|linearregres|none MultiPath-PSI|ASPIRE|linearregres|none NA additional_algorithms Individual probeset analysis method comboBox AltAnalyze NA NA NA NA splicing-index|FIRMA|none splicing-index|none NA filter_probe_types Select probe sets to include comboBox AltAnalyze Different options for which sets of probe sets to include in the alternative exon analysis. 
core|extended|full all|exons-only|junctions-only|combined-junctions NA NA all|combined-junctions all|combined-junctions NA analyze_all_conditions Type of group comparisons to perform comboBox AltAnalyze Different options for which sets of probe sets to include in the alternative exon analysis. pairwise|all groups|both pairwise|all groups|both pairwise|all groups|both NA pairwise|all groups|both pairwise|all groups|both NA p_threshold Max MiDAS/normalized intensity p-value enter AltAnalyze "Maximum user defined p-value to filter constitutive corrected alternative exon analysis t-test. For AltMouse analyses, only one of the two probe sets is required to meet this threshold." --- --- --- NA --- --- NA alt_exon_fold_cutoff Minimum alternative exon score enter AltAnalyze Fold or ASPIRE minimal score for inclusion with results from the alternative exon analysis (non-log). --- --- --- NA --- --- NA additional_score Individual probeset min alt. exon score enter AltAnalyze Fold or ASPIRE minimal score for inclusion with results from the alternative exon analysis (non-log). NA --- NA NA --- --- NA permute_p_threshold Maximum reciprocal junction p-value enter AltAnalyze Maximum allowed permutation p-value. NA --- NA NA --- --- NA gene_expression_cutoff Maximum absolute gene-expression change enter AltAnalyze Absolute maximum gene-expression fold change allowed between pairwise-comparisons for alternative exon analysis (otherwise gene is excluded). --- --- --- NA --- --- NA remove_intronic_junctions Remove novel-intron splice junctions comboBox AltAnalyze "Indicates whether to remove junctions with splice-site aligning to intron regions only (e.g., I1.1_1234-I1.1_1256)" NA NA NA NA NA yes|no NA perform_permutation_analysis Perform permutation analysis comboBox AltAnalyze Indicates whether to perform permutation analysis of ASPIRE or linearegress scores to calculate a statistical likelihood value. NA yes|no NA NA yes|no yes|no NA export_splice_index_values Export all normalized intensities single-checkbox AltAnalyze Export raw constitutive corrected probe set expression values for replicates. Useful for expression clustering of alternative exon changes. yes|no yes|no yes|no NA yes|no yes|no NA run_MiDAS Calculate MiDAS p-values single-checkbox AltAnalyze Export re-formatted input for analysis in MiDAS through an external application (Affymetrix Power Tools). yes|no NA yes|no NA yes|no yes|no NA calculate_splicing_index_p Calculate normalized intensity p-values single-checkbox AltAnalyze yes|no NA yes|no NA NA NA NA filter_for_AS Filter results for predicted AS single-checkbox AltAnalyze yes|no yes|no yes|no NA yes|no yes|no NA analyze_functional_attributes Align probesets to protein domains using comboBox AltAnalyze direct-alignment|inferred comparison direct-alignment|inferred comparison direct-alignment|inferred comparison NA direct-alignment|inferred comparison direct-alignment|inferred comparison NA microRNA_prediction_method # of agreeing miRNA databases needed comboBox AltAnalyze Include binding sites only present in multiple analyzed micoRNA target databases or in at least one databases. 
one|two or more one|two or more one|two or more NA one|two or more one|two or more NA pick_filtering_options Filter results by which algorithms multiple-checkbox Advanced SI fold|SI p-value|MiDAS p-value NA NA NA NA NA NA avg_all_for_ss_for_AS Use known exons probesets\k to derive gene expression\k versus constitutive only radio Advanced constitutive probesets|known exons constitutive probesets|known exons NA NA constitutive probesets|known exons constitutive exons|known exons NA new_species_code Two letter species code (e.g. Hs) enter NewSpecies --- --- --- --- --- --- --- new_species_name Species name (e.g. Homo sapiens) enter NewSpecies --- --- --- --- --- --- --- new_manufacturer Choose vendor comboBox NewSpecies --- --- --- --- --- --- --- allowed_array_systems Choose array or data-type comboBox NewSpecies2 --- --- --- --- --- --- --- ge_fold_cutoffs Minimum gene expression fold change enter GOElite --- --- --- --- --- --- --- ge_pvalue_cutoffs Maximum gene expression ttest p-value enter GOElite --- --- --- --- --- --- --- ge_ptype Filter based on the following p-value comboBox GOElite rawp|adjp rawp|adjp rawp|adjp rawp|adjp rawp|adjp rawp|adjp rawp|adjp filter_method Prune Ontology terms using comboBox GOElite z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z_threshold Z-score cutoff for initial filtering (> 0) enter GOElite --- --- --- --- --- --- --- p_val_threshold Enter permuted p-value cutoff (between 0-1) enter GOElite --- --- --- --- --- --- --- change_threshold Enter minimum number of changed genes enter GOElite --- --- --- --- --- --- --- ORA_algorithm Select the algorithm to use for ORA comboBox GOElite Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test resources_to_analyze Only analyze the following resource(s) comboBox GOElite all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology pathway_permutations Number of permutations for ORA enter GOElite --- --- --- --- --- --- --- mod Select primary relational gene system comboBox GOElite Ensembl EntrezGene Ensembl Ensembl|EntrezGene|HMDB Ensembl Ensembl Ensembl|EntrezGene|HMDB returnPathways Visualize all over-represented WikiPathways comboBox GOElite note: will add ~1 minute per pathway to be visualized yes|no yes|no yes|no yes|no yes|no yes|no yes|no get_additional Download/update additional resources comboBox GOElite None None None None None None None inputIDs (optional) Analyze these gene symbols enter InputGOEliteFiles --- --- --- --- --- --- --- elite_input_dir (optional) Select a directory of GO-Elite formatted input file(s). folder InputGOEliteFiles --- --- --- --- --- --- --- elite_denom_dir (optional) Select a directory of GO-Elite formatted denominator file(s). folder InputGOEliteFiles --- --- --- --- --- --- --- elite_output_dir Select an GO-Elite output directory folder InputGOEliteFiles "note: by default, the output will be stored in a set of new directories under\kthe same directory as the input expression file." 
--- --- --- --- --- --- --- input_cluster_file Select the tab-delimited expression file for clustering file heatmap note: the expression file must have an annotation row and annotation column.\k Log2 values recommended. Results saved to the folder 'DataPlots'.\k --- --- --- --- --- --- --- column_metric Select the column clustering metric comboBox heatmap http://docs.scipy.org/doc/scipy/reference/spatial.distance.html cosine braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule column_method Select the column clustering method comboBox heatmap http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach row_metric Select the row clustering metric comboBox heatmap http://docs.scipy.org/doc/scipy/reference/spatial.distance.html cosine braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule 
braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule row_method Select the row clustering method comboBox heatmap http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html weighted average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach color_selection Choose a color scheme comboBox heatmap note: colors are indicated as up-null-down http://matplotlib.sourceforge.net/examples/pylab_examples/show_colormaps.html red-black-sky red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys cluster_rows Cluster rows comboBox heatmap yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no cluster_columns Cluster columns comboBox heatmap yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no normalization Normalize rows relative to comboBox heatmap row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median transpose Transpose matrix comboBox heatmap no yes|no yes|no yes|no yes|no yes|no yes|no yes|no contrast Heatmap color contrast level (scaling factor) enter heatmap 3 3 3 3 3 3 3 GeneSetSelection (optional) Select GeneSet/Ontology to filter comboBox heatmap PathwaySelection (optional) Select a specific GeneSet multiple-comboBox heatmap OntologyID (optional) Type a pathway ID or Ontology ID enter heatmap --- --- --- --- --- --- --- GeneSelection (optional) Type a gene to get top correlated enter heatmap --- --- --- --- --- --- --- JustShowTheseIDs (optional) Display only selected gene IDs enter heatmap --- --- --- --- --- --- --- CorrelationCutoff (optional) Get all correlated genes with rho > enter heatmap --- --- --- --- --- --- --- HeatmapAdvanced Additional correlation options multiple-comboBox heatmap None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations 
Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle ClusterGOElite (optional) Perform GeneSet cluster enrichment comboBox heatmap heatmapGeneSets (optional) Store filtered genes for later enter heatmap --- --- --- --- --- --- --- input_cluster_file Select the tab-delimited expression file for clustering file PCA note: the expression file must have an annotation row and annotation column.\k Log2 values recommended. Results saved to the folder 'DataPlots'.\k --- --- --- --- --- --- --- pca_labels Display sample labels next to each object comboBox PCA yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no transpose Transpose matrix comboBox PCA no yes|no yes|no yes|no yes|no yes|no yes|no yes|no zscore Z-score normalize comboBox PCA yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no reimportModelScores Re-import prior t-SNE plot coordinates comboBox PCA yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no pca_algorithm Algorithm to use comboBox PCA SVD SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP dimensions Dimensions to display comboBox PCA 3D 2D|3D 2D|3D 2D|3D 2D|3D 2D|3D 2D|3D 2D|3D colorByGene (Optional) Enter a gene to color the PCA by enter PCA --- --- --- --- --- --- --- pcaGeneSets (Optional) Store top 200 component genes as enter PCA --- --- --- --- --- --- --- input_lineage_file Select the tab-delimited expression file for cell classification file LineageProfiler Recommended: Select an un-filtered log2 expression file (query) --- --- --- --- --- --- --- classificationAnalysis Analysis to perform comboBox LineageProfiler cellHarmony cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler markerFinder_file Select an ICGS or MarkerFinder reference classification file file LineageProfiler Recommended: Any heatmap text output file from AltAnalyze (reference) --- --- --- --- --- --- --- performDiffExp Perform differential expression analysis comboBox LineageProfiler yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no returnCentroids Align to cluster centroid instead of cell comboBox LineageProfiler centroid centroid|cell centroid|cell centroid|cell centroid|cell centroid|cell centroid|cell centroid|cell PearsonThreshold Enter cellHarmony Pearson correlation threshold enter LineageProfiler 0.4 --- --- --- --- --- --- --- FoldCutoff Enter differential expression 
fold threshold enter LineageProfiler 1.5 --- --- --- --- --- --- --- pvalThreshold Enter differential expression p-value threshold enter LineageProfiler 0.05 --- --- --- --- --- --- --- UseAdjPval Filter by adjusted rather than raw p-value comboBox LineageProfiler no yes|no yes|no yes|no yes|no yes|no yes|no yes|no geneModel_file (LineageProflier) Optional restricted marker models file LineageProfiler Note: These options are not applicable for cellHarmony --- --- --- --- --- --- --- compendiumType (LineageProflier) Data type to evaluate comboBox LineageProfiler protein_coding protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon compendiumPlatform (LineageProflier) Reference platform comboBox LineageProfiler exon exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array modelDiscovery (LineageProflier) Iterative model discovery comboBox LineageProfiler no yes|no yes|no yes|no yes|no yes|no yes|no yes|no input_data_file Select the tab-delimited expression file for ID Translation file IDConverter --- --- --- --- --- --- --- input_source Select the file ID system (first column) comboBox IDConverter --- --- --- --- --- --- --- output_source Select the ID system to add to this file comboBox IDConverter --- --- --- --- --- --- --- input_file1 Select the first file to merge file MergeFiles --- --- --- --- --- --- --- input_file2 Select the second file to merge file MergeFiles --- --- --- --- --- --- --- input_file3 Select the third file to merge (optional) file MergeFiles --- --- --- --- --- --- --- input_file4 Select the fourth file to merge (optional) file MergeFiles --- --- --- --- --- --- --- join_option Join files based on their comboBox MergeFiles Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union ID_option Only return one-to-one ID relationships comboBox MergeFiles False|True False|True False|True False|True False|True False|True False|True output_merge_dir Select the folder to save the merged file folder MergeFiles --- --- --- --- --- --- --- venn_input_file1 Select the first file to examine file VennDiagram --- --- --- --- --- --- --- venn_input_file2 Select the second file to examine file VennDiagram --- --- --- --- --- --- --- venn_input_file3 Select the third file to examine (optional) file VennDiagram --- --- --- --- --- --- --- venn_input_file4 Select the fourth file to examine (optional) file VennDiagram --- --- --- --- --- --- --- venn_output_dir Select the folder to save the examined file folder VennDiagram --- --- --- --- --- --- --- altanalyze_results_folder Select the AltAnalyze AltResults folder folder AltExonViewer --- --- --- --- --- --- --- data_type Type of exon-level data to visualize comboBox AltExonViewer raw expression raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index show_introns Show introns in addition to exons comboBox AltExonViewer no no|yes no|yes no|yes no|yes no|yes no|yes no|yes gene_symbol Enter one or more gene symbols (space delimited) enter AltExonViewer --- --- --- --- --- --- --- analysisType View as: comboBox AltExonViewer graph-plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot 
graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot altgenes_file (optional) Select a file with the first column as genes file AltExonViewer --- --- --- --- --- --- --- Genes_network Option 1: Build network from entered IDs enter network --- --- --- --- --- --- --- input_ID_file Option 2: Build network from input ID file (or SIF) file network --- --- --- --- --- --- --- GeneSetSelection_network Option 3: Build network from GeneSet Type comboBox network --- --- --- --- --- --- --- inputType_network (Option 2) File type comboBox network ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF PathwaySelection_network (Option 3) Select a specific GeneSet multiple-comboBox network --- --- --- --- --- --- --- OntologyID_network (Option 3) Type a pathway ID or Ontology ID enter network --- --- --- --- --- --- --- interactionDirs Select one or more interaction databases multiple-comboBox network WikiPathways|KEGG|BioGRID|TFTargets|common-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank degrees Select interactions of the following type comboBox network direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path update_interactions Download/update interaction databases comboBox network no yes|no yes|no yes|no yes|no yes|no yes|no yes|no elite_exp_file (optional) Select an input expression file file network note: File should be in GO-Elite format (see GO-Elite help) --- --- --- --- --- --- --- includeExpIDs_network Prioritize IDs from expression file in network comboBox network yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no output_net_folder Save results to folder network --- --- --- --- --- --- --- ExpressionCutoff "Gene expression (TPM, RPKM) cutoff" enter PredictGroups 1 --- --- --- --- --- --- --- CountsCutoff Gene read-count cutoff (not always available) enter PredictGroups 0 --- --- --- --- --- --- --- FoldDiff Fold change filter cutoff enter PredictGroups 4 --- --- --- --- --- --- --- rho_cutoff Minimum Pearson correlation cutoff enter PredictGroups 0.2 --- --- --- --- --- --- --- SamplesDiffering Minimum number of cells differing enter PredictGroups 4 --- --- --- --- --- --- --- dynamicCorrelation ICGS will identify an optimal correlation cutoff comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no removeOutliers Remove low expression outlier samples 
comboBox PredictGroups no yes|no yes|no yes|no yes|no yes|no yes|no yes|no featuresToEvaluate Features to evaluate comboBox PredictGroups Genes Genes|AltExon|Both Genes|AltExon|Both Genes|AltExon|Both Genes Genes|AltExon|Both Genes|AltExon|Both Genes restrictBy Restrict genes to protein coding comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no column_metric_predict Select the column clustering metric comboBox PredictGroups http://docs.scipy.org/doc/scipy/reference/spatial.distance.html cosine braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule column_method_predict Select the column clustering method comboBox PredictGroups http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach k (optional) number of user-defined ICGS clusters (k) enter PredictGroups --- --- --- --- --- --- --- GeneSelectionPredict (optional) Enter genes to build clusters from (guides) enter PredictGroups --- --- --- --- --- --- --- GeneSetSelectionPredict (optional) Or select guide GeneSet/Ontology comboBox PredictGroups PathwaySelectionPredict (optional) Select guide specific GeneSet(s) multiple-comboBox PredictGroups JustShowTheseIDsPredict (optional) Display selected gene IDs in results enter PredictGroups --- --- --- --- --- --- --- excludeCellCycle (optional) Eliminate cell cycle effects comboBox PredictGroups no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no \ No newline at end of file +Option Mac-Displayed option Display object Group Notes Descriptions Global defaults exon AltMouse gene 3'array junction RNASeq 10XGenomics +dbase_version Select database version comboBox ArrayType +manufacturer_selection Select vendor/data type comboBox 
ArrayType +species Select species comboBox ArrayType +array_type Select platform comboBox ArrayType --- --- --- --- --- --- --- +update_dbs """Get new species/vendor/array databases online """ single-checkbox ArrayType --- --- --- --- --- --- --- +run_from_scratch """Analysis options """ button AnalysisType Process CEL files|Process Expression file|Process AltAnalyze filtered|Annotate External Results|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Process AltAnalyze filtered|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Process AltAnalyze filtered|Annotate External Results|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Interactive Result Viewer|Additional Analyses Process CEL files|Process Expression file|Process AltAnalyze filtered|Interactive Result Viewer|Additional Analyses Process RNA-seq reads|Process Expression file|Process AltAnalyze filtered|Interactive Result Viewer|Additional Analyses Process Chromium Matrix|Process Expression file|Interactive Result Viewer|Additional Analyses +selected_version """Select database version """ comboBox OnlineDatabases --- --- --- --- --- --- --- +selected_species1 """Select species """ comboBox OnlineDatabases --- --- --- --- --- --- --- +selected_species2 """Select species """ comboBox OnlineDatabases --- --- --- --- --- --- --- +selected_species3 """Select species """ comboBox OnlineDatabases --- --- --- --- --- --- --- +update_goelite_resources Download/update all gene-set analysis databases single-checkbox OnlineDatabases yes|no yes|no yes|no yes|no yes|no yes|no yes|no +additional_analyses """Additional options """ button Additional Analyses Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files Pathway Enrichment|Pathway Visualization|Hierarchical Clustering|Dimensionality Reduction|Network Visualization|Identifier Translation|Cell Classification|AltExon Viewer|Venn Diagram|Merge Files +inputIDs (optional) Analyze these gene symbols enter InputGOEliteDirs --- --- --- --- --- --- --- +criterion_input_folder (optional) Select GO-Elite Input file directory folder InputGOEliteDirs --- --- --- --- --- --- --- +criterion_denom_folder (optional) Select GO-Elite Denominator file directory folder InputGOEliteDirs --- --- --- --- --- --- --- +main_output_folder Select GO-Elite output directory folder InputGOEliteDirs "note: by default, 
the output will be stored in a set of new directories under\kthe same directory as the input ID folder." --- --- --- --- --- --- --- +new_run Additional Options button AdditionalOptions Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run Perform GO/Pathway Analysis on Results|Change Parameters and Re-Run +dataset_name Give a name to this dataset enter InputCELFiles --- --- --- --- --- --- --- +input_cel_dir Select the CEL file containing folder folder InputCELFiles --- --- --- --- --- --- --- +input_fastq_dir (optional) Select fastq files to run in Kallisto folder InputCELFiles NA NA NA NA NA --- NA +output_CEL_dir Select an AltAnalyze result output directory folder InputCELFiles --- --- --- --- --- --- --- +multithreading Use multithreading for read genomic annotation comboBox InputCELFiles yes NA NA NA NA NA yes|no NA +build_exon_bedfile Build exon coordinate bed file to obtain BAM file exon counts\k(see the online tutorial for additional details and information) single-checkbox InputCELFiles NA NA NA NA NA --- NA +channel_to_extract Extract data from the following channels comboBox InputCELFiles NA NA NA green|red|green/red ratio|red/green ratio NA NA NA +remove_xhyb Remove probesets that have large cross-hybridization scores single-checkbox InputCELFiles --- --- NA NA NA NA NA +input_cdf_file Select the PGF library file for your array (required) file InputLibraryFiles note: the PGF file is apart of the standard library files for this array. This\kdirectory needs to also contain the CLF and BGP files for the array. These\kfiles can be downloaded from the Affymetrix website. --- --- --- --- --- --- --- +input_annotation_file Select the CSV NetAffx annotation file for your array (recommended) file InputLibraryFiles "note: the CSV annotation file should be listed under ""Current NetAffx\kannotations"" for this array." --- --- --- --- --- --- --- +input_exp_file Select a probe set expression file (required) file InputExpFiles --- --- --- --- --- --- --- +input_stats_file Select a probe set p-value file (optional) file InputExpFiles "note: if not selected, an appropriate statistic file in the selected probe set\kexpression file directory will be used." --- --- --- NA --- NA NA +output_dir Select an AltAnalyze result output directory folder InputExpFiles "note: by default, the output will be stored in a set of new directories under\kthe same directory as the input expression file.\k\kWhen analyzing already processed AltAnalyze outputs, select the file with\kthe prefix ""exp."" in the folder ""ExpressionInput""." --- --- --- --- --- --- --- +dabg_p Remove probesets with a DABG p-value above enter GeneExpression Maximum average DABG p-value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. --- --- --- NA --- NA NA +rpkm_threshold Remove genes with an RPKM less than enter GeneExpression NA NA NA NA NA --- --- +gene_exp_threshold Remove genes expressed below (non-log) enter GeneExpression Maximum average non-log expression value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. 
NA NA NA NA NA --- NA +exon_exp_threshold Remove exons expressed below (non-log) enter GeneExpression Maximum average non-log expression value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. NA NA NA NA NA --- NA +exon_rpkm_threshold Remove exons with an RPKM less than enter GeneExpression NA NA NA NA NA --- NA +expression_threshold Remove probesets expressed below (non-log) enter GeneExpression Maximum average non-log expression value (applied to one or both of the compared biological groups) for ExpressionBuilder filtering. --- --- --- NA --- --- NA +perform_alt_analysis Perform alternative exon analysis comboBox GeneExpression Indicates whether to just export the gene expression summary or in addition perform an alternative exon analysis. yes|just expression yes|just expression yes|just expression NA yes|just expression yes|just expression NA +analyze_as_groups Organize samples into groups comboBox GeneExpression Indicates whether AltAnalyze should peform group comparisons or just export interim results and skip filtering steps. Will also use metaprobesets as opposed to agglomerating information from filtered probesets for gene expression calculation. yes|only export sample data NA yes|only export sample data NA yes|only export sample data yes|only export sample data NA +expression_data_format Expression data format comboBox GeneExpression Format the user data is in (typically log2 expression values). log|non-log log|non-log log|non-log log|non-log log|non-log log|non-log log|non-log +normalize_feature_exp Normalize exon/junction expression comboBox GeneExpression NA NA NA NA NA RPKM|none NA +normalize_gene_data Normalize feature expression comboBox GeneExpression NA NA NA quantile|group|None NA NA quantile|group|None +avg_all_for_ss Determine gene expression levels using comboBox GeneExpression "Indicates whether to average the expression all probesets, as opposed to constitutive only, to calculate a gene expression value for an associated gene." constitutive probesets|core probesets NA constitutive probesets|core probesets NA constitutive probesets|known exons constitutive exons|expressed exons|known exons|known junctions NA +include_raw_data Include replicate experiment values in export comboBox GeneExpression "Whether or not to include replicate data in the ExpressionBuilder gene expression export file, as opposed to summary statistics (average, t-test p, folds, etc.)." 
yes|no yes|no yes|no yes|no yes|no yes|no yes|no +probability_algorithm Comparison group test statistic comboBox GeneExpression moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums moderated t-test|moderated Welch t-test|unpaired t-test|paired t-test|Kolmogorov Smirnov|Mann Whitney U|Rank Sums +FDR_statistic False discovery rate statistic comboBox GeneExpression Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value Benjamini-Hochberg|q-value +batch_effects Remove batch effects (Combat) comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no +marker_finder Predict condition-specific biomarkers comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no +visualize_results Perform expression clustering and visual QC comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no +run_lineage_profiler Perform cell profiling with LineageProfiler comboBox GeneExpression yes|no yes|no yes|no yes|no yes|no yes|no yes|no +run_goelite Analyze ontologies and pathways with GO-Elite comboBox GeneExpression run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later run immediately|decide later +input_filtered_dir Select a prior generated AltAnalyze results project folder folder InputFilteredFiles note: all files in this directory should be built by AltAnalyze from a prior analysis. "note: if not selected, all filtered probe set expression files in\k'AltExpression//'\kwill be used and results written to 'AltResults/AlternativeOutput'.\k" --- --- --- NA --- --- NA +input_external_dir Select a directory with lists of alternative probesets and scores. folder InputExternalFiles "note: all files in this directory should contain probe set IDs and scores\k\k(e.g., FIRMA and MADS results). Files should have at least 3 columns\k(1) probe set ID, (2) normalized fold change and (3) splicing p-value." --- --- --- NA --- --- NA +analysis_method Select the alternative exon algorithm comboBox AltAnalyze Alternative exon analysis method to apply to user data. splicing-index|FIRMA ASPIRE|linearregres splicing-index|FIRMA NA ASPIRE|linearregres|none MultiPath-PSI|ASPIRE|linearregres|none NA +additional_algorithms Individual probeset analysis method comboBox AltAnalyze NA NA NA NA splicing-index|FIRMA|none splicing-index|none NA +filter_probe_types Select probe sets to include comboBox AltAnalyze Different options for which sets of probe sets to include in the alternative exon analysis. 
core|extended|full all|exons-only|junctions-only|combined-junctions NA NA all|combined-junctions all|combined-junctions NA +analyze_all_conditions Type of group comparisons to perform comboBox AltAnalyze Indicates whether to perform pairwise group comparisons, a comparison across all groups, or both. pairwise|all groups|both pairwise|all groups|both pairwise|all groups|both NA pairwise|all groups|both pairwise|all groups|both NA +p_threshold Max MiDAS/normalized intensity p-value enter AltAnalyze "Maximum user defined p-value to filter constitutive corrected alternative exon analysis t-test. For AltMouse analyses, only one of the two probe sets is required to meet this threshold." --- --- --- NA --- --- NA +alt_exon_fold_cutoff Minimum alternative exon score enter AltAnalyze Fold or ASPIRE minimal score for inclusion with results from the alternative exon analysis (non-log). --- --- --- NA --- --- NA +additional_score Individual probeset min alt. exon score enter AltAnalyze Fold or ASPIRE minimal score for inclusion with results from the alternative exon analysis (non-log). NA --- NA NA --- --- NA +permute_p_threshold Maximum reciprocal junction p-value enter AltAnalyze Maximum allowed permutation p-value. NA --- NA NA --- --- NA +gene_expression_cutoff Maximum absolute gene-expression change enter AltAnalyze Absolute maximum gene-expression fold change allowed between pairwise-comparisons for alternative exon analysis (otherwise gene is excluded). --- --- --- NA --- --- NA +remove_intronic_junctions Remove novel-intron splice junctions comboBox AltAnalyze "Indicates whether to remove junctions with splice-site aligning to intron regions only (e.g., I1.1_1234-I1.1_1256)" NA NA NA NA NA yes|no NA +perform_permutation_analysis Perform permutation analysis comboBox AltAnalyze Indicates whether to perform permutation analysis of ASPIRE or linear regression scores to calculate a statistical likelihood value. NA yes|no NA NA yes|no yes|no NA +export_splice_index_values Export all normalized intensities single-checkbox AltAnalyze Export raw constitutive corrected probe set expression values for replicates. Useful for expression clustering of alternative exon changes. yes|no yes|no yes|no NA yes|no yes|no NA +run_MiDAS Calculate MiDAS p-values single-checkbox AltAnalyze Export re-formatted input for analysis in MiDAS through an external application (Affymetrix Power Tools). yes|no NA yes|no NA yes|no yes|no NA +calculate_splicing_index_p Calculate normalized intensity p-values single-checkbox AltAnalyze yes|no NA yes|no NA NA NA NA +filter_for_AS Filter results for predicted AS single-checkbox AltAnalyze yes|no yes|no yes|no NA yes|no yes|no NA +analyze_functional_attributes Align probesets to protein domains using comboBox AltAnalyze direct-alignment|inferred comparison direct-alignment|inferred comparison direct-alignment|inferred comparison NA direct-alignment|inferred comparison direct-alignment|inferred comparison NA +microRNA_prediction_method # of agreeing miRNA databases needed comboBox AltAnalyze Include only binding sites present in multiple analyzed microRNA target databases or in at least one database.
one|two or more one|two or more one|two or more NA one|two or more one|two or more NA +pick_filtering_options Filter results by which algorithms multiple-checkbox Advanced SI fold|SI p-value|MiDAS p-value NA NA NA NA NA NA +avg_all_for_ss_for_AS Use known exons probesets\k to derive gene expression\k versus constitutive only radio Advanced constitutive probesets|known exons constitutive probesets|known exons NA NA constitutive probesets|known exons constitutive exons|known exons NA +new_species_code Two letter species code (e.g. Hs) enter NewSpecies --- --- --- --- --- --- --- +new_species_name Species name (e.g. Homo sapiens) enter NewSpecies --- --- --- --- --- --- --- +new_manufacturer Choose vendor comboBox NewSpecies --- --- --- --- --- --- --- +allowed_array_systems Choose array or data-type comboBox NewSpecies2 --- --- --- --- --- --- --- +ge_fold_cutoffs Minimum gene expression fold change enter GOElite --- --- --- --- --- --- --- +ge_pvalue_cutoffs Maximum gene expression ttest p-value enter GOElite --- --- --- --- --- --- --- +ge_ptype Filter based on the following p-value comboBox GOElite rawp|adjp rawp|adjp rawp|adjp rawp|adjp rawp|adjp rawp|adjp rawp|adjp +filter_method Prune Ontology terms using comboBox GOElite z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination z-score|gene number|combination +z_threshold Z-score cutoff for initial filtering (> 0) enter GOElite --- --- --- --- --- --- --- +p_val_threshold Enter permuted p-value cutoff (between 0-1) enter GOElite --- --- --- --- --- --- --- +change_threshold Enter minimum number of changed genes enter GOElite --- --- --- --- --- --- --- +ORA_algorithm Select the algorithm to use for ORA comboBox GOElite Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test Permute p-value|Fisher Exact Test +resources_to_analyze Only analyze the following resource(s) comboBox GOElite all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology all|Pathways|Gene Ontology +pathway_permutations Number of permutations for ORA enter GOElite --- --- --- --- --- --- --- +mod Select primary relational gene system comboBox GOElite Ensembl EntrezGene Ensembl Ensembl|EntrezGene|HMDB Ensembl Ensembl Ensembl|EntrezGene|HMDB +returnPathways Visualize all over-represented WikiPathways comboBox GOElite note: will add ~1 minute per pathway to be visualized yes|no yes|no yes|no yes|no yes|no yes|no yes|no +get_additional Download/update additional resources comboBox GOElite None None None None None None None +inputIDs (optional) Analyze these gene symbols enter InputGOEliteFiles --- --- --- --- --- --- --- +elite_input_dir (optional) Select a directory of GO-Elite formatted input file(s). folder InputGOEliteFiles --- --- --- --- --- --- --- +elite_denom_dir (optional) Select a directory of GO-Elite formatted denominator file(s). folder InputGOEliteFiles --- --- --- --- --- --- --- +elite_output_dir Select an GO-Elite output directory folder InputGOEliteFiles "note: by default, the output will be stored in a set of new directories under\kthe same directory as the input expression file." 
--- --- --- --- --- --- --- +input_cluster_file Select the tab-delimited expression file for clustering file heatmap note: the expression file must have an annotation row and annotation column.\k Log2 values recommended. Results saved to the folder 'DataPlots'.\k --- --- --- --- --- --- --- +column_metric Select the column clustering metric comboBox heatmap http://docs.scipy.org/doc/scipy/reference/spatial.distance.html cosine braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule +column_method Select the column clustering method comboBox heatmap http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach +row_metric Select the row clustering metric comboBox heatmap http://docs.scipy.org/doc/scipy/reference/spatial.distance.html cosine braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule 
braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule +row_method Select the row clustering method comboBox heatmap http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html weighted average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach +color_selection Choose a color scheme comboBox heatmap note: colors are indicated as up-null-down http://matplotlib.sourceforge.net/examples/pylab_examples/show_colormaps.html red-black-sky red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys red-white-blue|red-black-sky|red-black-blue|red-black-green|yellow-black-blue|green-white-purple|coolwarm|seismic|yellow_orange_red|Greys +cluster_rows Cluster rows comboBox heatmap yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +cluster_columns Cluster columns comboBox heatmap yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +normalization Normalize rows relative to comboBox heatmap row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median NA|row mean|row median +transpose Transpose matrix comboBox heatmap no yes|no yes|no yes|no yes|no yes|no yes|no yes|no +contrast Heatmap color contrast level (scaling factor) enter heatmap 3 3 3 3 3 3 3 +GeneSetSelection (optional) Select GeneSet/Ontology to filter comboBox heatmap +PathwaySelection (optional) Select a specific GeneSet multiple-comboBox heatmap +OntologyID (optional) Type a pathway ID or Ontology ID enter heatmap --- --- --- --- --- --- --- +GeneSelection (optional) Type a gene to get top correlated enter heatmap --- --- --- --- --- --- --- +JustShowTheseIDs (optional) Display only selected gene IDs enter heatmap --- --- --- --- --- --- --- +CorrelationCutoff (optional) Get all correlated genes with rho > enter heatmap --- --- --- --- --- --- --- +HeatmapAdvanced Additional correlation options multiple-comboBox heatmap None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations 
Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle None Selected|Exclude Cell Cycle Effects|Top Correlated Only|Positive Correlations Only|Intra-Correlated Only|Perform Iterative Discovery|Correlation Only to Guides|Perform Monocle +ClusterGOElite (optional) Perform GeneSet cluster enrichment comboBox heatmap +heatmapGeneSets (optional) Store filtered genes for later enter heatmap --- --- --- --- --- --- --- +input_cluster_file Select the tab-delimited expression file for clustering file PCA note: the expression file must have an annotation row and annotation column.\k Log2 values recommended. Results saved to the folder 'DataPlots'.\k --- --- --- --- --- --- --- +pca_labels Display sample labels next to each object comboBox PCA yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +transpose Transpose matrix comboBox PCA no yes|no yes|no yes|no yes|no yes|no yes|no yes|no +zscore Z-score normalize comboBox PCA yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +reimportModelScores Re-import prior t-SNE plot coordinates comboBox PCA yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +pca_algorithm Algorithm to use comboBox PCA SVD SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP SVD|Eigen Vectors|t-SNE|UMAP +dimensions Dimensions to display comboBox PCA 3D 2D|3D 2D|3D 2D|3D 2D|3D 2D|3D 2D|3D 2D|3D +colorByGene (Optional) Enter a gene to color the PCA by enter PCA --- --- --- --- --- --- --- +pcaGeneSets (Optional) Store top 200 component genes as enter PCA --- --- --- --- --- --- --- +input_lineage_file Select the tab-delimited expression file for cell classification file LineageProfiler Recommended: Select an un-filtered log2 expression file (query) --- --- --- --- --- --- --- +classificationAnalysis Analysis to perform comboBox LineageProfiler cellHarmony cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler cellHarmony|LineageProfiler +markerFinder_file Select an ICGS or MarkerFinder reference classification file file LineageProfiler Recommended: Any heatmap text output file from AltAnalyze (reference) --- --- --- --- --- --- --- +labels (optional) Supply a reference cell annotation file file LineageProfiler --- --- --- --- --- --- --- +performDiffExp Perform differential expression analysis comboBox LineageProfiler yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +returnCentroids Align to cluster centroid instead of cell comboBox LineageProfiler community community|centroid|cell community|centroid|cell community|centroid|cell community|centroid|cell community|centroid|cell 
community|centroid|cell community|centroid|cell +PearsonThreshold Enter cellHarmony Pearson correlation threshold enter LineageProfiler 0.4 --- --- --- --- --- --- --- +FoldCutoff Enter differential expression fold threshold enter LineageProfiler 1.5 --- --- --- --- --- --- --- +pvalThreshold Enter differential expression p-value threshold enter LineageProfiler 0.05 --- --- --- --- --- --- --- +UseAdjPval Filter by adjusted rather than raw p-value comboBox LineageProfiler no yes|no yes|no yes|no yes|no yes|no yes|no yes|no +geneModel_file (LineageProfiler) Optional restricted marker models file LineageProfiler Note: These options are not applicable for cellHarmony --- --- --- --- --- --- --- +compendiumType (LineageProfiler) Data type to evaluate comboBox LineageProfiler protein_coding protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon protein_coding|ncRNA|AltExon +compendiumPlatform (LineageProfiler) Reference platform comboBox LineageProfiler exon exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array exon|gene|3'array +modelDiscovery (LineageProfiler) Iterative model discovery comboBox LineageProfiler no yes|no yes|no yes|no yes|no yes|no yes|no yes|no +input_data_file Select the tab-delimited expression file for ID Translation file IDConverter --- --- --- --- --- --- --- +input_source Select the file ID system (first column) comboBox IDConverter --- --- --- --- --- --- --- +output_source Select the ID system to add to this file comboBox IDConverter --- --- --- --- --- --- --- +input_file1 Select the first file to merge file MergeFiles --- --- --- --- --- --- --- +input_file2 Select the second file to merge file MergeFiles --- --- --- --- --- --- --- +input_file3 Select the third file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file4 Select the fourth file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +join_option Join files based on their comboBox MergeFiles Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union +ID_option Only return one-to-one ID relationships comboBox MergeFiles False|True False|True False|True False|True False|True False|True False|True +output_merge_dir Select the folder to save the merged file folder MergeFiles --- --- --- --- --- --- --- +venn_input_file1 Select the first file to examine file VennDiagram --- --- --- --- --- --- --- +venn_input_file2 Select the second file to examine file VennDiagram --- --- --- --- --- --- --- +venn_input_file3 Select the third file to examine (optional) file VennDiagram --- --- --- --- --- --- --- +venn_input_file4 Select the fourth file to examine (optional) file VennDiagram --- --- --- --- --- --- --- +venn_output_dir Select the folder to save the examined file folder VennDiagram --- --- --- --- --- --- --- +altanalyze_results_folder Select the AltAnalyze AltResults folder folder AltExonViewer --- --- --- --- --- --- --- +data_type Type of exon-level data to visualize comboBox AltExonViewer raw expression raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index raw expression|splicing-index +show_introns Show introns in addition to exons comboBox AltExonViewer no no|yes no|yes no|yes no|yes no|yes no|yes no|yes
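A minimal, illustrative sketch (not part of the patch) of how the cellHarmony defaults shown in the rows above (Pearson 0.4, fold 1.5, p-value 0.05, raw p-values, optional label file) map onto the keyword arguments of LineageProfilerIterate.importAndCombineExpressionFiles, whose signature appears further down in this diff. The input paths here are hypothetical placeholders:

import LineageProfilerIterate

species = 'Hs'
reference_exp_file = '/data/ref/FinalMarkerHeatmap_all.txt'   # hypothetical ICGS/MarkerFinder reference heatmap file
query_exp_file = '/data/query/exp.query.txt'                  # hypothetical query expression file
classification_file = '/data/query/CellClassification.txt'    # hypothetical cellHarmony classification result

LineageProfilerIterate.importAndCombineExpressionFiles(
    species, reference_exp_file, query_exp_file, classification_file,
    platform='RNASeq',
    pearsonThreshold=0.4,        # GUI default above (the function default is 0.1)
    peformDiffExpAnalysis=True,  # keyword is spelled this way in the code
    pvalThreshold=0.05,
    fold_cutoff=1.5,
    use_adjusted_pval=False,
    customLabels=None)           # or a reference cell annotation file, per the 'labels' row above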
+gene_symbol Enter one or more gene symbols (space delimited) enter AltExonViewer --- --- --- --- --- --- --- +analysisType View as: comboBox AltExonViewer graph-plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot graph-plot|heatmap|Sashimi-Plot +altgenes_file (optional) Select a file with the first column as genes file AltExonViewer --- --- --- --- --- --- --- +Genes_network Option 1: Build network from entered IDs enter network --- --- --- --- --- --- --- +input_ID_file Option 2: Build network from input ID file (or SIF) file network --- --- --- --- --- --- --- +GeneSetSelection_network Option 3: Build network from GeneSet Type comboBox network --- --- --- --- --- --- --- +inputType_network (Option 2) File type comboBox network ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF ID list|GO-Elite input file|SIF +PathwaySelection_network (Option 3) Select a specific GeneSet multiple-comboBox network --- --- --- --- --- --- --- +OntologyID_network (Option 3) Type a pathway ID or Ontology ID enter network --- --- --- --- --- --- --- +interactionDirs Select one or more interaction databases multiple-comboBox network WikiPathways|KEGG|BioGRID|TFTargets|common-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank WikiPathways|KEGG|BioGRID|TFTargets|common-microRNATargets|all-microRNATargets|common-DrugBank|all-DrugBank +degrees Select interactions of the following type comboBox network direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path direct|indirect|all possible|shortest path +update_interactions Download/update interaction databases comboBox network no yes|no yes|no yes|no yes|no yes|no yes|no yes|no +elite_exp_file (optional) Select an input expression file file network note: File should be in GO-Elite format (see GO-Elite help) --- --- --- --- --- --- --- +includeExpIDs_network Prioritize IDs from expression file in network comboBox network yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +output_net_folder Save results to folder network --- --- --- --- --- --- --- +ExpressionCutoff "Gene expression (TPM, RPKM) cutoff" enter PredictGroups 1 --- --- --- --- --- --- --- +CountsCutoff Gene read-count cutoff (not always available) enter PredictGroups 0 --- --- --- --- --- --- --- +FoldDiff Fold change filter cutoff enter PredictGroups 4 --- --- --- --- --- --- --- +rho_cutoff Minimum Pearson correlation cutoff enter PredictGroups 0.2 --- --- --- --- --- --- --- +SamplesDiffering Minimum number of cells differing 
enter PredictGroups 4 --- --- --- --- --- --- --- +dynamicCorrelation ICGS will identify an optimal correlation cutoff comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +removeOutliers Remove low expression outlier samples comboBox PredictGroups no yes|no yes|no yes|no yes|no yes|no yes|no yes|no +featuresToEvaluate Features to evaluate comboBox PredictGroups Genes Genes|AltExon|Both Genes|AltExon|Both Genes|AltExon|Both Genes Genes|AltExon|Both Genes|AltExon|Both Genes +restrictBy Restrict genes to protein coding comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +column_metric_predict Select the column clustering metric comboBox PredictGroups http://docs.scipy.org/doc/scipy/reference/spatial.distance.html cosine braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule +column_method_predict Select the column clustering method comboBox PredictGroups http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach +k (optional) number of user-defined ICGS clusters (k) enter PredictGroups --- --- --- --- --- --- --- +GeneSelectionPredict (optional) Enter genes to build clusters from (guides) enter PredictGroups --- --- --- --- --- --- --- +GeneSetSelectionPredict (optional) Or select guide GeneSet/Ontology comboBox PredictGroups +PathwaySelectionPredict (optional) Select guide specific GeneSet(s) multiple-comboBox PredictGroups +JustShowTheseIDsPredict (optional) Display selected gene IDs in results enter PredictGroups --- --- --- --- --- --- --- +excludeCellCycle (optional) Eliminate cell cycle effects comboBox PredictGroups no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no conservative|stringent|no \ No newline at end of 
file diff --git a/ExpressionBuilder.py b/ExpressionBuilder.py index d5d1383..0a12a72 100755 --- a/ExpressionBuilder.py +++ b/ExpressionBuilder.py @@ -323,7 +323,12 @@ def simplerGroupImport(group_dir): fn = filepath(group_dir) for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) - try: sample_filename,group_number,group_name = string.split(data,'\t') + try: + group_data = string.split(data,'\t') + sample_filename = group_data[0] + group_name = group_data[-1] + if len(group_data)>3: + forceError except Exception: #print 'Non-Standard Groups file or missing relationships' print string.split(data,'\t')[:10], 'more than 3 columns present in groups file' @@ -331,7 +336,7 @@ def simplerGroupImport(group_dir): sample_group_db[sample_filename] = group_name return sample_group_db -def simpleGroupImport(group_dir,splitHeaders=False, ignoreComps=False): +def simpleGroupImport(group_dir,splitHeaders=False, ignoreComps=False, reverseOrder=False): """ Used for calculating fold changes prior to clustering for individual samples (genomtric folds) """ import collections @@ -366,8 +371,12 @@ def simpleGroupImport(group_dir,splitHeaders=False, ignoreComps=False): if splitHeaders: if '~' in sample_filename: sample_filename = string.split(sample_filename,'~')[-1] group_sample_db[sample_filename] = group_name+':'+sample_filename - try: group_name_sample_db[group_name].append(group_name+':'+sample_filename) - except Exception: group_name_sample_db[group_name] = [group_name+':'+sample_filename] + if reverseOrder==False: + try: group_name_sample_db[group_name].append(group_name+':'+sample_filename) + except Exception: group_name_sample_db[group_name] = [group_name+':'+sample_filename] + else: + try: group_name_sample_db[group_name].append(sample_filename) + except Exception: group_name_sample_db[group_name] = [sample_filename] sample_list.append(sample_filename) group_db[sample_filename] = group_name @@ -375,15 +384,16 @@ def simpleGroupImport(group_dir,splitHeaders=False, ignoreComps=False): ### Get the comparisons indicated by the user if ignoreComps==False: ### Not required for some analyses - comps_name_db,comp_groups = simpleCompsImport(group_dir,group_name_db) + comps_name_db,comp_groups = simpleCompsImport(group_dir,group_name_db,reverseOrder=reverseOrder) else: comps_name_db={}; comp_groups=[] return sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db -def simpleCompsImport(group_dir,group_name_db): +def simpleCompsImport(group_dir,group_name_db,reverseOrder=False): """ Used for calculating fold changes prior to clustering for individual samples (genomtric folds) """ comps_dir = string.replace(group_dir,'groups.','comps.') - comps_name_db={} + import collections + comps_name_db=collections.OrderedDict() comp_groups=[] comps_dir = verifyExpressionFile(comps_dir) fn = filepath(comps_dir) @@ -391,6 +401,8 @@ def simpleCompsImport(group_dir,group_name_db): data = cleanUpLine(line) try: exp_group_num,con_group_num = string.split(data,'\t') + if reverseOrder: + con_group_num, exp_group_num = exp_group_num,con_group_num exp_group_name = group_name_db[exp_group_num] con_group_name = group_name_db[con_group_num] try: comps_name_db[con_group_name].append(exp_group_name) @@ -2678,7 +2690,10 @@ def compareJunctionExpression(gene): print incl_exp print excl_exp;sys.exit()""" #if 'ENSMUSG00000009350:E14.2_87617106-E15.1' in incl: print feature_exp_db[incl] - altexons = unique.unique(critical_junction_pair_db[incl,excl]) + try: + altexons = 
unique.unique(critical_junction_pair_db[incl,excl]) + except: + altexons=[] altexons = string.join(altexons,'|') if num_excl_events > num_incl_events: #print max_ratio, '\t',gene diff --git a/GO_Elite.py b/GO_Elite.py index d8e9e70..92dd95d 100755 --- a/GO_Elite.py +++ b/GO_Elite.py @@ -1849,10 +1849,12 @@ def __init__(self,null): self.log = open(log_file, "w") def write(self, message): - self.log = open(log_file, "a") - self.terminal.write(message) - self.log.write(message) - self.log.close() + try: + self.log = open(log_file, "a") + self.terminal.write(message) + self.log.write(message) + self.log.close() + except: pass def flush(self): pass diff --git a/InteractionBuilder.py b/InteractionBuilder.py index d468053..342a030 100755 --- a/InteractionBuilder.py +++ b/InteractionBuilder.py @@ -262,9 +262,11 @@ def importqueryResults(species,dir_file,id_db): if len(id_db)==0: ### Otherwise, already provided gene IDs to query translated=0 + count=0 try: x=0 for line in fileRead: + count+=1 try: data = cleanUpLine(line) t = string.split(data,'\t') @@ -809,7 +811,35 @@ def getGeneIDs(Genes): except Exception: input_IDs[i] = i ### Currently not dealt with return input_IDs +def remoteBuildNetworks(species, outputDir, interactions=['WikiPathways','KEGG','TFTargets']): + """ Attempts to output regulatory/interaction networks from a directory of input files """ + + directory = 'gene-mapp' + interactionDirs=[] + obligatorySet=[] ### Always include interactions from these if associated with any input ID period + secondarySet=[] + inputType = 'IDs' + degrees = 'direct' + + for i in interactions: + fn = filepath('AltDatabase/goelite/'+species+'/gene-interactions/Ensembl-'+i+'.txt') + interactionDirs.append(fn) + + pdfs=[] + dir_list = read_directory(outputDir) + for file in dir_list: + if 'GE.' in file: + input_file_dir = outputDir+'/'+file + + output_filename = buildInteractions(species,degrees,inputType,input_file_dir,outputDir,interactionDirs, + directory=outputDir,expressionFile=input_file_dir, IncludeExpIDs=True) + try: pdfs.append(output_filename[:-4]+'.pdf') + except: pass + return pdfs + if __name__ == '__main__': + remoteBuildNetworks('Mm', '/Users/saljh8/Desktop/DemoData/cellHarmony/Mouse_BoneMarrow/inputFile/cellHarmony/DifferentialExpression_Fold_2.0_adjp_0.05') + sys.exit() Species = 'Hs' Degrees = 2 inputType = 'IDs' diff --git a/LineageProfilerIterate.py b/LineageProfilerIterate.py index c5a3c4a..1a3eb33 100755 --- a/LineageProfilerIterate.py +++ b/LineageProfilerIterate.py @@ -1534,6 +1534,9 @@ def PearsonCorrelationAnalysis(sample_exp_db,tissue_exp_db): print "Program Error!!! The length of the input expression list does not match the reference expression list (could indicate duplicate sample names)" pass rho,p = stats.pearsonr(tissue_expression_list,sample_expression_list) + + if p == 1: ### When rho == nan... 
will break the program and result in mis-assignment of cells to clusters + rho = 0.0 #print rho,p #rho,p = stats.spearmanr(tissue_expression_list,sample_expression_list) try: pearson_list[sample].append(rho) @@ -1575,7 +1578,6 @@ def pearson(array1,array2): r = sum_a/math.sqrt(sum_b*sum_c) return r - def Median(array): array.sort() len_float = float(len(array)) @@ -2135,12 +2137,23 @@ def harmonizeClassifiedSamples(species,reference_exp_file, query_exp_file, class try: FoldCutoff = fl.FoldCutoff() except: FoldCutoff = 1.5 + + customLabels = None + try: + if len(fl.Labels())>0: + customLabels = fl.Labels() + except: pass ### Output the alignment results and perform the differential expression analysis output_file,query_output_file,folds_file,DEGs_combined = importAndCombineExpressionFiles(species,reference_exp_file, query_exp_file,classification_file,pearsonThreshold=pearsonThreshold,peformDiffExpAnalysis=peformDiffExpAnalysis, - pvalThreshold=pvalThreshold,fold_cutoff=FoldCutoff,use_adjusted_pval=use_adjusted_pval) + pvalThreshold=pvalThreshold,fold_cutoff=FoldCutoff,use_adjusted_pval=use_adjusted_pval,customLabels=customLabels) + output_dir = export.findParentDir(output_file) + + if len(folds_file)<1: + """ If performDiffExp==False, see if a prior folds file exists """ + folds_file = string.replace(output_file,'-ReOrdered','-AllCells-folds') + ### Output the cellHarmony heatmaps from visualization_scripts import clustering row_method = None; row_metric = 'cosine'; column_method = None; column_metric = 'euclidean'; color_gradient = 'yellow_black_blue' @@ -2151,26 +2164,78 @@ def harmonizeClassifiedSamples(species,reference_exp_file, query_exp_file, class else: display = True display = False + print 'Exporting cellHarmony heatmaps...' + heatmaps_dir = output_dir+'/heatmaps/' + try: os.mkdir(heatmaps_dir) + except: pass try: graphics = clustering.runHCexplicit(query_output_file, [], row_method, row_metric, column_method, column_metric, color_gradient, transpose, Normalize=Normalize, contrast=5, display=display) - + plot = graphics[-1][-1][:-4]+'.pdf' + file = graphics[-1][-1][:-4]+'.txt' + shutil.copy(plot,output_dir+'/heatmaps/heatmap-query-aligned.pdf') + shutil.copy(file,output_dir+'/heatmaps/heatmap-query-aligned.txt') + graphics = clustering.runHCexplicit(output_file, [], row_method, row_metric, column_method, column_metric, color_gradient, transpose, Normalize=Normalize, contrast=5, display=display) + plot = graphics[-1][-1][:-4]+'.pdf' + file = graphics[-1][-1][:-4]+'.txt' + shutil.copy(plot,output_dir+'/heatmaps/heatmap-all-cells-combined.pdf') + shutil.copy(file,output_dir+'/heatmaps/heatmap-all-cells-combined.txt') + + except: print traceback.format_exc() zscore = True graphics=[] transpose='no' + + try: fl.setSpecies(species); fl.setVendor("3'array") + except: + import UI + fl = UI.ExpressionFileLocationData(folds_file,'','','') + fl.setSpecies(species); fl.setVendor("3'array") + fl.setOutputDir(output_dir) + + try: platform=platform + except: + platform = 'RNASeq' ### Build-UMAP plot import UI + import warnings + warnings.filterwarnings('ignore') try: - UI.performPCA(output_file, 'no', 'UMAP', False, None, plotType='2D', + try: os.mkdir(fl.OutputDir()+'/UMAP-plots') + except: pass + """ Output UMAP combined plot colored by reference and query cell identity """ + plot = UI.performPCA(output_file, 'no', 'UMAP', False, None, plotType='2D', display=False, geneSetName=None, species=species, zscore=True, reimportModelScores=False, separateGenePlots=False, returnImageLoc=True) + plot =
plot[-1][-1][:-4]+'.pdf' + shutil.copy(plot,fl.OutputDir()+'/UMAP-plots/UMAP-query-vs-ref.pdf') + + """ Output UMAP combined plot colored by cell states """ + plot = UI.performPCA(output_file, 'no', 'UMAP', False, None, plotType='2D', + display=False, geneSetName=None, species=species, zscore=True, reimportModelScores=True, + separateGenePlots=False, returnImageLoc=True, forceClusters=True) + plot = plot[-1][-1][:-4]+'.pdf' + shutil.copy(plot,fl.OutputDir()+'/UMAP-plots/UMAP-query-vs-ref-clusters.pdf') + + """ Output individual UMAP plots colored by cell states """ + groups_file = string.replace(output_file,'exp.','groups.') + plots = UI.performPCA(output_file, 'no', 'UMAP', False, None, plotType='2D', + display=False, geneSetName=None, species=species, zscore=True, reimportModelScores=True, + separateGenePlots=False, returnImageLoc=True, forceClusters=True, maskGroups=groups_file) + for plot in plots: + plot = plot[-1][:-4]+'.pdf' + + if '-cellHarmony-Reference-' in plot: + shutil.copy(plot,fl.OutputDir()+'/UMAP-plots/UMAP-ref-clusters.pdf') + else: + shutil.copy(plot,fl.OutputDir()+'/UMAP-plots/UMAP-query-clusters.pdf') except: try: print 'UMAP error encountered (dependency not met), trying t-SNE' @@ -2178,37 +2243,213 @@ def harmonizeClassifiedSamples(species,reference_exp_file, query_exp_file, class display=False, geneSetName=None, species=species, zscore=True, reimportModelScores=False, separateGenePlots=False, returnImageLoc=True) except: pass + + useMarkerFinder=False - try: fl.setSpecies(species); fl.setVendor("3'array") - except: - import UI - fl = UI.ExpressionFileLocationData(folds_file,'','','') - fl.setSpecies(species); fl.setVendor("3'array") - fl.setOutputDir(export.findParentDir(folds_file)) - ### Run MarkerFinder - if len(DEGs_combined)>0: - fl.setRPKMThreshold(0.00) - fl.setCorrelationDirection('up') - compendiumType = 'protein_coding' - genesToReport = 100000 - correlateAll = True - import markerFinder - markerFinder.analyzeData(folds_file,species,platform,compendiumType,geneToReport=genesToReport,correlateAll=correlateAll,AdditionalParameters=fl,logTransform=False) - export.deleteFolder(fl.OutputDir()+'/MarkerFinder-positive') - os.rename(fl.OutputDir()+'/MarkerFinder',fl.OutputDir()+'/MarkerFinder-positive') - fl.setCorrelationDirection('down') - markerFinder.analyzeData(folds_file,species,platform,compendiumType,geneToReport=genesToReport,correlateAll=correlateAll,AdditionalParameters=fl,logTransform=False) - export.deleteFolder(fl.OutputDir()+'/MarkerFinder-negative') - os.rename(fl.OutputDir()+'/MarkerFinder',fl.OutputDir()+'/MarkerFinder-negative') - exportCombinedMarkerFinderResults(folds_file,fl.OutputDir()+'/MarkerFinder-positive',fl.OutputDir()+'/MarkerFinder-negative',DEGs_combined,species) - -def exportCombinedMarkerFinderResults(folds_file,positive_dir,negative_dir,DEGs_combined,species): - positive_dir+= '/AllGenes_correlations-ReplicateBased.txt' - negative_dir+= '/AllGenes_correlations-ReplicateBased.txt' + if len(DEGs_combined) and useMarkerFinder: + exportCombinedMarkerFinderResults(species,platform,fl,folds_file,DEGs_combined) + elif len(DEGs_combined): + exportPvalueRankedGenes(species,platform,fl,folds_file,DEGs_combined) + + ### Cleanup directory + cleanupOutputFolder(output_dir) + +def cleanupOutputFolder(output_dir): + """ Reorganizes cellHarmony output folder to be easier to navigate """ + + other_files_dir = output_dir+'/OtherFiles/' + + def createFolder(folder): + try: os.mkdir(folder) + except: pass + createFolder(other_files_dir) +
createFolder(other_files_dir+'DataPlots/') + createFolder(other_files_dir+'GO-Elite/') + + dir_list = unique.read_directory(output_dir) + for file in dir_list: + if 'exp.' in file or 'groups.' in file or 'comps.' in file: + shutil.move(output_dir+file,other_files_dir+file) + def moveFolder(folder): + try: + dir_list = unique.read_directory(output_dir+folder) + for file in dir_list: + shutil.move(output_dir+folder+file,other_files_dir+folder+file) + export.deleteFolder(output_dir+folder) + except: + pass + + moveFolder('DataPlots/') + moveFolder('GO-Elite/') + +def exportPvalueRankedGenes(species,platform,fl,folds_file,DEGs_combined): + """ Produce a hierarchically ordered heatmap of differential expression differences + across cell-states to provide a holistic representation of impacted genes """ + + """ Write out headers for OrganizedDifferential results """ + + export_file = fl.OutputDir()+'/OrganizedDifferentials.txt' + export_object = export.ExportFile(export_file) + + """ Import the folds and headers for all genes/comparisons """ + headers,matrix = getDataMatrix(folds_file) + display_headers=[] + cell_states=[] + for h in headers: + h = string.replace(h,'-fold','') + display_headers.append(h+':'+h) + cell_states.append(h) + + export_object.write(string.join(['UID']+display_headers,'\t')+'\n') + + top_comparison_genes={} + comparisons={} + for gene in DEGs_combined: + pval,direction,comparison = DEGs_combined[gene] + ### Group by comparison and remove ".txt" suffix + comparison = string.split(comparison,'_vs_')[0] + comparison = string.replace(comparison,'GE.','') + comparison = string.replace(comparison,'PSI.','') + try: top_comparison_genes[comparison+"--"+direction].append((pval,gene)) + except: top_comparison_genes[comparison+"--"+direction] = [(pval,gene)] + comparisons[comparison]=[] + + """ Augment the ordered cell-state header list with the other broader comparisons """ + cluster_comparisons=[] + for comparison in comparisons: + if comparison not in cell_states: + if '-Query' not in comparison: + bulk_comparison = comparison + else: + first_group = string.split(comparison,'|')[0] + index = cell_states.index(first_group) + cluster_comparisons.append([index,comparison]) + cluster_comparisons.sort() + cell_states.reverse() + for (rank,comparison) in cluster_comparisons: + cell_states.append(comparison) + try: cell_states.append(bulk_comparison) + except: pass ### If no bulk differences are ranked better than cell-state differences + cell_states.reverse() + + for h in cell_states: + for direction in ('positive','negative'): + signature = h+'--'+direction + if signature not in top_comparison_genes: continue ### no DEGs in this direction for this comparison + top_comparison_genes[signature].sort() + for (pval,gene) in top_comparison_genes[signature]: + try: export_object.write(string.join([signature+':'+gene]+map(str,matrix[gene]),'\t')+'\n') + except: pass ### for Ensembl IDs + export_object.close() + + from visualization_scripts import clustering + row_method = None; row_metric = 'cosine'; column_method = None; column_metric = 'euclidean'; color_gradient = 'yellow_black_blue' + transpose = False; Normalize=False; display = False + import UI + gsp = UI.GeneSelectionParameters(species,"RNASeq","RNASeq") + gsp.setPathwaySelect('None Selected') + gsp.setGeneSelection('') + gsp.setOntologyID('') + gsp.setGeneSet('None Selected') + gsp.setJustShowTheseIDs('') + gsp.setTranspose(False) + gsp.setNormalize(False) + gsp.setGeneSelection('') + if species == 'Mm' or species == 'Hs': + gsp.setClusterGOElite('PathwayCommons') #GeneOntology + else: + gsp.setClusterGOElite('GeneOntology') +
graphics = clustering.runHCexplicit(export_file, [], row_method, row_metric, + column_method, column_metric, color_gradient, gsp, Normalize=Normalize, + contrast=3, display=display) + shutil.copy(graphics[-1][-1][:-4]+'.pdf',export_file[:-4]+'.pdf') + + from import_scripts import OBO_import; import ExpressionBuilder; import RNASeq + gene_to_symbol_db = ExpressionBuilder.importGeneAnnotations(species) + symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol_db) + try: + TFs = RNASeq.importGeneSets('Biotypes',filterType='transcription regulator',geneAnnotations=gene_to_symbol_db,speciesName = species) + gsp.setJustShowTheseIDs(string.join(TFs.keys(),' ')) + gsp.setClusterGOElite('MergedTFTargets') #GeneOntology + + graphics = clustering.runHCexplicit(export_file, [], row_method, row_metric, + column_method, column_metric, color_gradient, gsp, Normalize=Normalize, + contrast=3, display=display) + shutil.copy(graphics[-1][-1][:-4]+'.pdf',export_file[:-4]+'-TFs.pdf') + except ZeroDivisionError: + pass + +def editFoldsGroups(folds_file): + folds_copy = string.replace(folds_file,'-AllCells-folds','-AllCells-folds2') + + original_groups_file = string.replace(folds_file,'exp.','groups.') + output_groups_file = string.replace(folds_copy,'exp.','groups.') + eo = export.ExportFile(output_groups_file) + eo.write('Null\t1\tNull\n') + for line in open(original_groups_file,'rU').xreadlines(): + data = cleanUpLine(line) + t = string.split(data,'\t') + eo.write(t[0]+'\t2\tall\n') + eo.close() + eo = export.ExportFile(string.replace(output_groups_file,'groups.','comps.')) + eo.close() + + eo = export.ExportFile(folds_copy) + header=True + for line in open(folds_file,'rU').xreadlines(): + data = cleanUpLine(line) + t = string.split(data,'\t') + if header: + header=False + eo.write(string.join([t[0],'Null']+t[1:],'\t')+'\n') + else: + eo.write(string.join([t[0],'0']+t[1:],'\t')+'\n') + eo.close() + return folds_copy + +def exportCombinedMarkerFinderResults(species,platform,fl,folds_file,DEGs_combined): + """ Use MarkerFinder to define the major gene expression associated patterns """ + + fl.setRPKMThreshold(0.00) + fl.setCorrelationDirection('up') + compendiumType = 'protein_coding' + genesToReport = 100000 + correlateAll = True + import markerFinder + directories=[] + + markerFinder.analyzeData(folds_file,species,platform,compendiumType,geneToReport=genesToReport,correlateAll=correlateAll,AdditionalParameters=fl,logTransform=False) + export.deleteFolder(fl.OutputDir()+'/MarkerFinder-positive') + os.rename(fl.OutputDir()+'/MarkerFinder',fl.OutputDir()+'/MarkerFinder-positive') + directories.append(fl.OutputDir()+'/MarkerFinder-positive') + + fl.setCorrelationDirection('down') + markerFinder.analyzeData(folds_file,species,platform,compendiumType,geneToReport=genesToReport,correlateAll=correlateAll,AdditionalParameters=fl,logTransform=False) + export.deleteFolder(fl.OutputDir()+'/MarkerFinder-negative') + os.rename(fl.OutputDir()+'/MarkerFinder',fl.OutputDir()+'/MarkerFinder-negative') + directories.append(fl.OutputDir()+'/MarkerFinder-negative') + + #try: shutil.copyfile(src,dst) + #except: pass + fl.setCorrelationDirection('up') + + folds_copy = editFoldsGroups(folds_file) + + markerFinder.analyzeData(folds_copy,species,platform,compendiumType,geneToReport=genesToReport,correlateAll=correlateAll,AdditionalParameters=fl,logTransform=False) + export.deleteFolder(fl.OutputDir()+'/MarkerFinder-all-up') + os.rename(fl.OutputDir()+'/MarkerFinder',fl.OutputDir()+'/MarkerFinder-all-up') + 
directories.append(fl.OutputDir()+'/MarkerFinder-all-up') + + fl.setCorrelationDirection('down') + markerFinder.analyzeData(folds_copy,species,platform,compendiumType,geneToReport=genesToReport,correlateAll=correlateAll,AdditionalParameters=fl,logTransform=False) + export.deleteFolder(fl.OutputDir()+'/MarkerFinder-all-down') + os.rename(fl.OutputDir()+'/MarkerFinder',fl.OutputDir()+'/MarkerFinder-all-down') + directories.append(fl.OutputDir()+'/MarkerFinder-all-down') + import collections marker_db=collections.OrderedDict() - + def importMarkerFinderResults(input_file,direction): header_row=True for line in open(input_file,'rU').xreadlines(): @@ -2219,6 +2460,7 @@ def importMarkerFinderResults(input_file,direction): header_row=False else: rho = float(rho) + if geneID in marker_db: ### Positive marker genes alt_cell_state,alt_direction,alt_rho = marker_db[geneID] @@ -2226,14 +2468,20 @@ def importMarkerFinderResults(input_file,direction): marker_db[geneID]=cell_state, direction, rho elif geneID in DEGs_combined: marker_db[geneID]=cell_state, direction, rho - importMarkerFinderResults(positive_dir,'positive') - importMarkerFinderResults(negative_dir,'negative') + + for file in directories: + file+='/AllGenes_correlations-ReplicateBased.txt' + if 'positive' in file or 'up' in file: + direction = 'positive' + else: + direction = 'negative' + importMarkerFinderResults(file,direction) ordered_markers={} for gene in marker_db: cell_state, direction, rho = marker_db[gene] - try: ordered_markers[cell_state].append([direction, rho, gene]) - except Exception: ordered_markers[cell_state]= [[direction, rho, gene]] + try: ordered_markers[cell_state+'_'+direction].append([direction, rho, gene]) + except Exception: ordered_markers[cell_state+'_'+direction]= [[direction, rho, gene]] for cell_state in ordered_markers: ordered_markers[cell_state].sort() ordered_markers[cell_state].reverse() @@ -2246,9 +2494,14 @@ def importMarkerFinderResults(input_file,direction): headers2.append(h+':'+h) export_object.write(string.join(['UID']+headers2,'\t')+'\n') for h in headers: - if h in ordered_markers: - for (direction, rho, gene) in ordered_markers[h]: - export_object.write(string.join([h+':'+gene]+map(str,matrix[gene]),'\t')+'\n') + h2 = h+'_positive' + if h2 in ordered_markers: + for (direction, rho, gene) in ordered_markers[h2]: + export_object.write(string.join([h2+':'+gene]+map(str,matrix[gene]),'\t')+'\n') + h2 = h+'_negative' + if h2 in ordered_markers: + for (direction, rho, gene) in ordered_markers[h2]: + export_object.write(string.join([h2+':'+gene]+map(str,matrix[gene]),'\t')+'\n') export_object.close() from visualization_scripts import clustering @@ -2278,7 +2531,7 @@ def Score(self): return self.score def AssignedClass(self): return self.assigned_class def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,classification_file,platform='RNASeq',pearsonThreshold=0.1, - peformDiffExpAnalysis=True, pvalThreshold=0.05,fold_cutoff=1.5, use_adjusted_pval=False): + peformDiffExpAnalysis=True, pvalThreshold=0.05,fold_cutoff=1.5, use_adjusted_pval=False, customLabels = None): """Harmonize the numerical types and feature IDs of the input files, then combine """ import collections @@ -2298,7 +2551,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl output_dir = root_dir+'/exp.'+string.replace(output_dir,'exp.','') output_dir =string.replace(output_dir,'-OutliersRemoved','') groups_dir = string.replace(output_dir,'exp.','groups.') -
ref_exp_db,ref_headers,ref_col_clusters,cluster_format_reference = importExpressionFile(reference_exp_file) + ref_exp_db,ref_headers,ref_col_clusters,cluster_format_reference = importExpressionFile(reference_exp_file,customLabels=customLabels) cluster_results = clustering.remoteImportData(query_exp_file,geneFilter=ref_exp_db) if len(cluster_results[0])>0: filterIDs = ref_exp_db else: filterIDs = False @@ -2316,6 +2569,11 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl groups_file = string.replace(groups_file,'-centroid__','__') comps_file = string.replace(groups_file,'groups.','comps.') groups_reference_file = string.replace(groups_file,'-AllCells.txt','-Reference.txt') + + """ If reference labels provided, use these instead of cluster labels """ + if customLabels!=None: + groups_reference_file = customLabels + try: import ExpressionBuilder ### If using centroids for classification, merge with original reference groups sample_group_ref_db = ExpressionBuilder.simplerGroupImport(groups_reference_file) @@ -2329,6 +2587,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl sample_group_ref_db={} groups_to_reference_cells={} + column_clusters=[] final_clusters=[] numeric_cluster=True @@ -2382,8 +2641,10 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl if firstLine: firstLine=False header_row = values - score_index = header_row.index('Max-Rho') - class_index = header_row.index('Predicted Class') + try: score_index = header_row.index('Max-Rho') + except: score_index = header_row.index('Correlation') + try: class_index = header_row.index('Predicted Class') + except: class_index = header_row.index('Ref Barcode') else: sample = values[0] score = float(values[score_index]) @@ -2394,7 +2655,6 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl if len(groups_to_reference_cells)>0: ### Hence, centroids - replace centroids with individual reference cells assigned_class = groups_to_reference_cells[assigned_class][-1] cd = ClassificationData(sample,score,assigned_class) - #print sample, assigned_class;sys.exit() try: sample_classes[assigned_class].append([score,cd]) except Exception: sample_classes[assigned_class] = [[score,cd]] query_header_proppegated_clusters[sample]=assigned_class @@ -2448,12 +2708,13 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl new_headers.append(cd.Sample()) new_query_headers.append(cd.Sample()) - for sample in query_headers: - if ':' in sample: - sample_alt = string.split(sample,':')[1] - try: cluster = classified_samples[sample] - except: cluster = classified_samples[sample_alt] - column_clusters.append(cluster) + if len(classified_samples)>0: + for sample in query_headers: + if ':' in sample: + sample_alt = string.split(sample,':')[1] + try: cluster = classified_samples[sample] + except: cluster = classified_samples[sample_alt] + column_clusters.append(cluster) """ Combine the two datasets, before re-ordering """ ### In case the UIDs in the two datasets are in different formats (we assume either Ensembl or Symbol) @@ -2492,16 +2753,20 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl export_object.close() """ Write out a groups file that lists samples per file for optional visualization """ - export_object = export.ExportFile(groups_dir[:-4]+'-ReOrdered.txt') + reference_query_groups = groups_dir[:-4]+'-ReOrdered.txt' + group_object = export.ExportFile(reference_query_groups) 
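For orientation, a minimal sketch (not part of the patch) of the groups./comps. convention written just below and parsed by ExpressionBuilder.simplerGroupImport earlier in this diff: each groups. row is sample, group number, group name (tab-delimited), and the matching comps. file lists experimental and control group numbers, so '2\t1' means query (group 2) versus reference (group 1). Barcodes and labels here are hypothetical:

# Hypothetical helper mirroring the export logic in this hunk.
def write_reference_query_groups(groups_path, ref_samples, query_samples,
                                 ref_label='reference-ICGS', query_label='query-matrix'):
    out = open(groups_path, 'w')
    for sample in ref_samples:
        out.write(sample + '\t1\t' + ref_label + '\n')    # reference cells -> group 1
    for sample in query_samples:
        out.write(sample + '\t2\t' + query_label + '\n')  # query cells -> group 2
    out.close()
    comps = open(groups_path.replace('groups.', 'comps.'), 'w')
    comps.write('2\t1\n')  # compare query (group 2) against reference (group 1)
    comps.close()

write_reference_query_groups('groups.cellHarmony-ReOrdered.txt',
                             ['AAACCTGAGCGATATA'], ['TTTGGTTTCACCGTAA'])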
+ comps_object = export.ExportFile(string.replace(reference_query_groups,'groups.','comps.')) + comps_object.write('2\t1\n') + comps_object.close() for sample in ref_headers: if ':' in sample: sample = string.split(sample,':')[1] - export_object.write(sample+'\t1\t'+ref_filename_clean+'\n') + group_object.write(sample+'\t1\t'+ref_filename_clean+'\n') for sample in query_headers: if ':' in sample: - sample = string.split(sample,':')[1] - export_object.write(sample+'\t2\t'+query_filename_clean+'\n') - export_object.close() + sample = string.split(sample,':')[1] + group_object.write(sample+'\t2\t'+query_filename_clean+'\n') + group_object.close() """ Re-order the samples based on the classification analysis """ ### The ref_headers has the original reference sample order used to guide the query samples @@ -2518,6 +2783,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl added=[] order=0 ordering=[] + ordered_clusters=[] for sample_name in filter_names: if sample_name not in added: added.append(sample_name) @@ -2530,11 +2796,14 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl try: cluster = classified_samples[sample_name] group_export_object.write(sample_name+'\t'+cluster+'\t'+prefix+cluster+'\n') + if cluster not in ordered_clusters: + ordered_clusters.append(cluster) except Exception: pass + group_export_object.close() - sampleIndexSelection.filterFile(input_file,output_file,filter_names2) - sampleIndexSelection.filterFile(input_file,query_output_file,new_query_headers2) + sampleIndexSelection.filterFile(input_file,output_file,filter_names2,force=True) + sampleIndexSelection.filterFile(input_file,query_output_file,new_query_headers2,force=True) """Export a combined groups and comps file to perform apples-to-apples comparisons""" folds_file='' @@ -2544,7 +2813,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl final_clusters.sort() new_query_headers=[] expression_file = string.replace(groups_file,'groups.','exp.') - export_object = export.ExportFile(groups_file) + export_object = export.ExportFile(groups_file) group_counter=0 added_groups={} group_numbers={} @@ -2559,6 +2828,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl groups_to_samples={} cluster_lookup={} for (source,cluster,sample) in final_clusters: + #print source,cluster,sample;sys.exit() cluster_name = prefix+str(cluster)+'_'+source cluster_lookup[cluster_name]=prefix+str(cluster) if cluster_name in added_groups: @@ -2600,6 +2870,10 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl added=[] g2_all = len(ref_headers) g1_all = len(query_headers) + if len(query_header_proppegated_clusters) > 1000: + min_cells = 19 + else: + min_cells = 3 for (null,group1,group2) in comps: g = cluster_lookup[group1] g1_len = len(groups_to_samples[group1]) @@ -2610,8 +2884,8 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl #print g2_len, g1_len, g2_all-g2_len, g1_all-g1_len, g2_all, g1_all, pvalue added.append(group1); added.append(group2) freq_object.write(g+'\t'+str(g2_len)+'\t'+str(g1_len)+'\t'+str(pvalue)+'\t'+str(g2_frq)[:4]+'\t'+str(g1_frq)[:4]+'\n') - if added_groups[group1]>3 and added_groups[group2]>3: - ### Atleast 4 cells present in the two compared groups + if added_groups[group1]>min_cells and added_groups[group2]>min_cells: + ### At least min_cells+1 cells must be present in the two compared groups g1 = str(group_numbers[group1]) g2 =
str(group_numbers[group2]) export_object.write(g1+'\t'+g2+'\n') @@ -2624,9 +2898,10 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl export_object.close() freq_object.close() try: - index1=1;index2=2; x_axis='Number of cells'; y_axis = 'Reference clusters'; title='Assigned Cell Frequencies' + index1=-2;index2=-1; x_axis='Cell-State Percentage'; y_axis = 'Reference clusters'; title='Assigned Cell Frequencies' clustering.barchart(root_dir+'/cell-frequency-stats.txt',index1,index2,x_axis,y_axis,title) except Exception: + #print traceback.format_exc() pass from stats_scripts import metaDataAnalysis @@ -2648,6 +2923,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl output_dir = root_dir+'Events-LogFold_0.1_rawp' """ if peformDiffExpAnalysis: + gene_summaries=[] if use_adjusted_pval: pval_type = '_adjp_'+str(pvalThreshold) else: @@ -2657,14 +2933,85 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl log_fold_cutoff = math.log(float(fold_cutoff),2) + """ Re-run the differential analysis comparing all cell states in the query vs. reference """ metaDataAnalysis.remoteAnalysis(species,expression_file,groups_file,platform=platform,use_custom_output_dir=use_custom_output_dir, - log_fold_cutoff=log_fold_cutoff,use_adjusted_pval=use_adjusted_pval,pvalThreshold=pvalThreshold) + log_fold_cutoff=log_fold_cutoff,use_adjusted_pval=use_adjusted_pval,pvalThreshold=pvalThreshold,suppressPrintOuts=True) gene_summary_file = output_dir+'/gene_summary.txt' - gene_summary_file2 = string.replace(gene_summary_file,use_custom_output_dir,'') - export.copyFile(gene_summary_file,gene_summary_file2) + moved_summary_file = gene_summary_file[:-4]+'-cell-states.txt' + try: os.remove(moved_summary_file) + except: pass + shutil.move(gene_summary_file,moved_summary_file) + gene_summaries.append(moved_summary_file) + + """ Re-run the differential analysis comparing all cells in the query vs. reference (not cell states) """ + metaDataAnalysis.remoteAnalysis(species,expression_file,reference_query_groups,platform=platform,use_custom_output_dir=use_custom_output_dir, + log_fold_cutoff=log_fold_cutoff,use_adjusted_pval=use_adjusted_pval,pvalThreshold=pvalThreshold,suppressPrintOuts=True) + gene_summary_file = output_dir+'/gene_summary.txt' + moved_summary_file = gene_summary_file[:-4]+'-query-reference.txt' + try: os.remove(moved_summary_file) + except: pass + shutil.move(gene_summary_file,moved_summary_file) + gene_summaries.append(moved_summary_file) + + """ Export a folds-file with the average fold differences per cell state (query vs. 
reference) """ + try: + folds_file = expression_file[:-4]+'-folds.txt' + comparisons = sampleIndexSelection.getComparisons(groups_file) + filter_names,group_index_db = sampleIndexSelection.getFilters(groups_file,calculateCentroids=True) + sampleIndexSelection.filterFile(expression_file,folds_file,(filter_names,group_index_db),force=False,calculateCentroids=True,comparisons=comparisons) + + export_object = export.ExportFile(string.replace(folds_file,'exp.','groups.')) + header = getHeader(folds_file) + i=0 + for h in header: + i+=1 + export_object.write(h+'\t'+str(i)+'\t'+h+'\n') + export_object.close() + export_object = export.ExportFile(string.replace(folds_file,'exp.','comps.')) + export_object.close() + except: + pass + + """ Find similar clusters """ + cellstate_DEGs = aggregateRegulatedGenes(output_dir) + clustered_groups_file = findSimilarImpactedCellStates(folds_file,cellstate_DEGs) + + if clustered_groups_file!=None: + """ Re-run the differential analysis comparing all cells in the query vs. reference (not cell states) """ + metaDataAnalysis.remoteAnalysis(species,expression_file,clustered_groups_file,platform=platform,use_custom_output_dir=use_custom_output_dir, + log_fold_cutoff=log_fold_cutoff,use_adjusted_pval=use_adjusted_pval,pvalThreshold=pvalThreshold,suppressPrintOuts=True) + gene_summary_file = output_dir+'/gene_summary.txt' + moved_summary_file = gene_summary_file[:-4]+'-clustered-states.txt' + try: os.remove(moved_summary_file) + except: pass + shutil.move(gene_summary_file,moved_summary_file) + gene_summaries.append(moved_summary_file) + + """ Clean up cellHarmony output directory """ + export.deleteFolder(root_dir+'ExpressionProfiles') + export.deleteFolder(root_dir+'PValues') + export.deleteFolder(root_dir+'top50') + + gene_summary_combined = string.replace(gene_summary_file,use_custom_output_dir,'') + combineSummaryFiles(gene_summaries,gene_summary_combined) + index1=2;index2=3; x_axis='Number of DEGs'; y_axis = 'Reference clusters'; title='cellHarmony Differentially Expressed Genes' - clustering.barchart(gene_summary_file2,index1,index2,x_axis,y_axis,title,color1='IndianRed',color2='SkyBlue') + clustering.barchart(gene_summary_combined,index1,index2,x_axis,y_axis,title,color1='IndianRed',color2='SkyBlue') + + import InteractionBuilder + print 'Generating gene regulatory networks...' + pdfs = InteractionBuilder.remoteBuildNetworks(species, output_dir) + networks_dir = root_dir+'/networks/' + try: os.mkdir(networks_dir) + except: pass + for pdf in pdfs: + file = export.findFilename(pdf) + file = string.replace(file,'AltAnalyze-network-WKT_GE.','') + file = string.replace(file,'_cellHarmony-Reference-interactions','') + shutil.copy(pdf,root_dir+'/networks/'+file) + + """ Union of all differentially expressed genes """ all_DEGs = aggregateRegulatedGenes(output_dir) display_genes = string.join(list(all_DEGs),' ') ICGS_DEGs_combined = ref_exp_db.keys() @@ -2702,32 +3049,208 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl print traceback.format_exc() print '!!!!! NO merged expression file availble for differential expression analyis (apples-to-apples).' 
- try: - folds_file = expression_file[:-4]+'-folds.txt' - comparisons = sampleIndexSelection.getComparisons(groups_file) - filter_names,group_index_db = sampleIndexSelection.getFilters(groups_file,calculateCentroids=True) - sampleIndexSelection.filterFile(expression_file,folds_file,(filter_names,group_index_db),force=False,calculateCentroids=True,comparisons=comparisons) - - export_object = export.ExportFile(string.replace(folds_file,'exp.','groups.')) - header = getHeader(folds_file) - i=0 - for h in header: - i+=1 - export_object.write(h+'\t'+str(i)+'\t'+h+'\n') - export_object.close() - export_object = export.ExportFile(string.replace(folds_file,'exp.','comps.')) - export_object.close() - except: - pass print 'Completed cellHarmony file creation...' return output_file, query_output_file, folds_file, all_DEGs +def combineSummaryFiles(gene_summaries,gene_summary_combined): + """ Combine the Summary output files from the differential expression analyses """ + + eo = export.ExportFile(gene_summary_combined) + header = None + for file in gene_summaries: + firstLine = True + for line in open(file,'rU').xreadlines(): + data = line.rstrip() + t = string.split(data,'\t') + if firstLine: + firstLine = False + if header == None: + header= line + eo.write(line) + else: + eo.write(line) + eo.close() + +def findSimilarImpactedCellStates(folds_file,cellstate_DEGs): + import numpy, scipy + import collections + similar_groups=collections.OrderedDict() + + folds_header_clean=[] + expression_db, folds_header = simpleExpressionFileImport(folds_file,filterUID=cellstate_DEGs) + matrix=[] + for gene in expression_db: + matrix.append(map(float,expression_db[gene])) + + matrix = numpy.array(matrix) + matrix = numpy.transpose(matrix) + + for h in folds_header: + folds_header_clean.append(string.replace(h,'-fold','')) + + output_dir = os.path.abspath(os.path.join(folds_file, os.pardir)) + pvalue_dir = output_dir+'/PValues/' + dir_list = unique.read_directory(pvalue_dir) + import UI + pval_all_db={} + genes={} + valid_comparisons=[] + for file in dir_list: + if '.txt' in file: + pval_file_dir = pvalue_dir+'/'+file + pval_db, header = simpleExpressionFileImport(pval_file_dir,filterUID=cellstate_DEGs) + for gene in pval_db: genes[gene]=[] + for h in folds_header_clean: + if h in file: + valid_comparisons.append(h) ### Use the simple name of this cluster + pval_all_db[h] = pval_db + break + + """Create a binarized matrix of p-values""" + + combined_pvalues={} + for file in valid_comparisons: + for gene in genes: + if gene in pval_all_db[file]: + if float(pval_all_db[file][gene][0])<0.1: + val=1 + else: + val=0 + else: + val=0 ### If no variance detected in either cell-state for that gene + try: combined_pvalues[gene].append(val) + except: combined_pvalues[gene]=[val] + + pval_patterns={} + for gene in combined_pvalues: + try: pval_patterns[tuple(combined_pvalues[gene])]+=1 + except: pval_patterns[tuple(combined_pvalues[gene])]=1 + patterns_ranked=[] + for pattern in pval_patterns: + count = pval_patterns[pattern] + s = sum(pattern) + if s == 0 or s==1 or s==len(valid_comparisons): + continue + else: + patterns_ranked.append([count,pattern]) + patterns_ranked.sort() + patterns_ranked.reverse() + + new_aggregate_clusters=[] + for (count,pattern) in patterns_ranked[:4]: ### restrict to the top 4 patterns + pattern_groups=[] + if count > 9: ### require that atleast 10 genes have that pattern + i=0 + for cluster in pattern: + if cluster == 1: + pattern_groups.append(valid_comparisons[i]) + i+=1 + 
new_aggregate_clusters.append(pattern_groups) + + """ + index=0 + prior_cell_state_array=[] + cluster=1 + for cell_state_array in matrix: + if len(prior_cell_state_array)>0: + rho,p = scipy.stats.pearsonr(cell_state_array,prior_cell_state_array) + #print cluster, header[index], rho + if rho>0.3: + if cluster in similar_groups: + similar_groups[cluster].append(header[index]) + else: + similar_groups[cluster]=[header[index-1],header[index]] + else: + if len(similar_groups)==0: + ### occurs for the first group if not similar to the next + similar_groups[cluster] = [header[index-1]] + cluster+=1 + similar_groups[cluster] = [header[index]] + prior_cell_state_array = cell_state_array + index+=1 + + num_new_clusters=0 + if len(similar_groups)!=len(folds_header): ### If not clusters combined + exp_file = string.replace(folds_file,'-folds.txt','.txt') + groups_file = string.replace(exp_file,'exp.','groups.') + output_groups_file = string.replace(groups_file,'.txt','-clusters.txt') + output_comps_file = string.replace(output_groups_file,'groups.','comps.') + + og = export.ExportFile(output_groups_file) + oc = export.ExportFile(output_comps_file) + + import ExpressionBuilder + sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db = ExpressionBuilder.simpleGroupImport(groups_file,reverseOrder=True) + + for cluster in similar_groups: + if len(similar_groups[cluster])>1: + ### Combine the cells from different groups + exp_cells=[] + control_cells=[] + for group in similar_groups[cluster]: + exp_group = string.replace(group,'-fold','') + exp_cells += group_name_sample_db[exp_group] + control_group = comps_name_db[exp_group][1] + control_cells += group_name_sample_db[control_group] + cluster_number = cluster*2-1 + if len(similar_groups[cluster]) != len(folds_header): ### If ALL clusters combined + num_new_clusters+=1 + cluster_name = string.replace(string.join(similar_groups[cluster],'|'),'-fold','') + for cell in exp_cells: + og.write(cell+'\t'+str(cluster_number)+'\t'+cluster_name+'-Query'+'\n') + for cell in control_cells: + og.write(cell+'\t'+str(cluster_number+1)+'\t'+cluster_name+'-Ref'+'\n') + oc.write(str(cluster_number)+'\t'+str(cluster_number+1)+'\n') + og.close() + oc.close() + """ + + num_new_clusters=0 + if len(new_aggregate_clusters)>0: ### If not clusters combined + exp_file = string.replace(folds_file,'-folds.txt','.txt') + groups_file = string.replace(exp_file,'exp.','groups.') + output_groups_file = string.replace(groups_file,'.txt','-clusters.txt') + output_comps_file = string.replace(output_groups_file,'groups.','comps.') + + og = export.ExportFile(output_groups_file) + oc = export.ExportFile(output_comps_file) + + import ExpressionBuilder + sample_list,group_sample_db,group_db,group_name_sample_db,comp_groups,comps_name_db = ExpressionBuilder.simpleGroupImport(groups_file,reverseOrder=True) + + for clustered_groups in new_aggregate_clusters: + + ### Combine the cells from different groups + exp_cells=[] + control_cells=[] + for exp_group in clustered_groups: + exp_cells += group_name_sample_db[exp_group] + control_group = comps_name_db[exp_group][1] + control_cells += group_name_sample_db[control_group] + num_new_clusters+=1 + cluster_name = string.replace(string.join(clustered_groups,'|'),'-fold','') + for cell in exp_cells: + og.write(cell+'\t'+str(num_new_clusters)+'\t'+cluster_name+'-Query'+'\n') + for cell in control_cells: + og.write(cell+'\t'+str(num_new_clusters+1)+'\t'+cluster_name+'-Ref'+'\n') + 
oc.write(str(num_new_clusters)+'\t'+str(num_new_clusters+1)+'\n') + num_new_clusters = num_new_clusters*2 + og.close() + oc.close() + + if num_new_clusters>0: + return output_groups_file + else: + return None + def aggregateRegulatedGenes(folder): + """ Find the best representative comparison for each gene + dependent on the observed rawp """ import UI - geneIDs=[] + geneIDs={} files = UI.read_directory(folder) + for file in files: - comp_geneIDs=[] if '.txt' in file and ('PSI.' in file or 'GE.' in file): ls=[] fn = folder+'/'+file @@ -2740,19 +3263,35 @@ def aggregateRegulatedGenes(folder): uid_index = t.index('Event-Direction') fold_index = t.index('dPSI') else: - uid_index = t.index('Symbol') + uid_index = t.index('GeneID') + symbol_index = t.index('Symbol') fold_index = t.index('LogFold') + p_index = t.index('rawp') firstLine= False continue uid = t[uid_index] + symbol = t[symbol_index] + pval = float(t[p_index]) fold = t[fold_index] - comp_geneIDs.append((fold,uid)) - comp_geneIDs.sort() ### Sort by fold - comp_geneIDs.reverse() - for (fold,uid) in comp_geneIDs: - if uid not in geneIDs: ### Remove this to inlcude redundant genes - if len(uid)>0: - geneIDs.append(uid) + if '-' in fold: + direction = 'negative' + else: + direction = 'positive' + try: + geneIDs[uid].append([pval,direction,file]) + except: + geneIDs[uid] = [[pval,direction,file]] + if symbol != uid: + """ Support both ID systems for different databases """ + try: + geneIDs[symbol].append([pval,direction,file]) + except: + geneIDs[symbol] = [[pval,direction,file]] + + for uid in geneIDs: + hits = geneIDs[uid] + hits.sort() + geneIDs[uid] = hits[0] return geneIDs def convertFromEnsemblToSymbol(exp_db,gene_to_symbol): @@ -2785,8 +3324,19 @@ def checkForGroupsFile(filename,headers): pass return headers, new_headers, groups_db -def importExpressionFile(input_file,ignoreClusters=False,filterIDs=False): +def importExpressionFile(input_file,ignoreClusters=False,filterIDs=False,customLabels=None): """ Import the expression value and harmonize (cellHarmony) to log, non-fold values """ + + if customLabels!=None: + try: + import ExpressionBuilder + customLabels = ExpressionBuilder.simplerGroupImport(customLabels) + except: + print 'WARNING!!! Custom labels failed to import due to formatting error.' 
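aggregateRegulatedGenes above now keeps, for every gene (and its symbol alias), the single comparison with the smallest raw p-value rather than a flat list of gene IDs. A stripped-down sketch of that selection step; the input is assumed to be tuples already parsed from the GE.*.txt result files, and the gene and file names in the usage comment are hypothetical.

def best_comparison_per_gene(records):
    """records: iterable of (gene, pval, direction, comparison_file).
    Returns {gene: (pval, direction, comparison_file)}, keeping the hit
    with the lowest p-value for each gene."""
    gene_hits = {}
    for gene, pval, direction, comp in records:
        gene_hits.setdefault(gene, []).append((pval, direction, comp))
    for gene in gene_hits:
        gene_hits[gene].sort()          # tuples sort by p-value first
        gene_hits[gene] = gene_hits[gene][0]
    return gene_hits

# best = best_comparison_per_gene([
#     ('Gata1', 0.03, 'positive', 'GE.c1_vs_Others.txt'),
#     ('Gata1', 0.0004, 'negative', 'GE.c4_vs_Others.txt')])
# best['Gata1']  ->  (0.0004, 'negative', 'GE.c4_vs_Others.txt')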
+ customLabels={} + else: + customLabels={} + import collections expression_db=collections.OrderedDict() column_cluster_index=collections.OrderedDict() @@ -2833,6 +3383,13 @@ def importExpressionFile(input_file,ignoreClusters=False,filterIDs=False): i=0 for header in header_row: cluster = clusters[i] + """ If labels provided by the user """ + try: cluster = customLabels[header] + except: + try: + h=string.split(header,':')[1] + cluster = customLabels[h] + except: pass column_cluster_index[header]=cluster i+=1 @@ -2885,20 +3442,258 @@ def importExpressionFile(input_file,ignoreClusters=False,filterIDs=False): print len(expression_db),'IDs imported' return expression_db, header_row, column_cluster_index, cluster_format_file + +def createMetaICGSAllCells(ICGS_files,outputDir,CenterMethod='median', + species='Hs',platform='RNASeq',PearsonThreshold=0.9): + """This function combines ICGS or MarkerFinder results from multiple outputs + after createMetaICGSResults has been run""" + + """ Rename the Expanded CellHarmonyReference files based on the dataset """ + files_to_merge, all_cells, groups_db = renameICGSfiles(ICGS_files,CenterMethod,ReturnAllCells=True) + final_output_dir = outputDir+'/CellHarmonyReference/ICGS-merged-reference.txt' + genes, ordered_barcodes, ordered_clusters = importMergedICGS(final_output_dir,outputDir,groups_db,CenterMethod) + + join_option = 'Intersection' + ID_option = False ### Require unique-matches only when True + output_merge_dir = outputDir + + # output a combined centroid file, combining all input ICGS or MarkerFinder results with raw expression values + from import_scripts import mergeFiles + + """ Merge the full cell matrix results for the union of ICGS-NMF genes """ + outputfile=output_merge_dir+'/MergedFiles.txt' + outputfile = mergeFiles.joinFiles(files_to_merge, join_option, ID_option, output_merge_dir) # Comment out if already run + revised_outputfile = output_merge_dir+'/ICGS-merged-all-temp.txt' + + from import_scripts import sampleIndexSelection + print 'Reorganizing the full-cell merged-ICGS results' + sampleIndexSelection.filterFile(outputfile,revised_outputfile,ordered_barcodes) # Comment out if already run + + """ Re-output the ordered cell expression matrix with ordered genes in heatmap format """ + exportFullMergedMarkerFile(revised_outputfile,genes,ordered_clusters) + +def exportFullMergedMarkerFile(filename,genes,ordered_clusters): + """ Re-order the genes in the genes dictionary order""" + reordered_expression_file = filename[:-4]+'-sorted' + reordered_expression_file = exportSorted(filename, genes) # Comment out if already run + + output = filename[:-4]+'-temp.txt' + eo = export.ExportFile(output) + rowNumber=1 + for line in open(reordered_expression_file,'rU').xreadlines(): + data = line.rstrip() + t = string.split(data,'\t') + if rowNumber==1: + header = t[1:] + eo.write(string.join(['UID','row_clusters-flat']+t[1:],'\t')+'\n') + elif rowNumber==2: + eo.write(string.join(['column_clusters-flat','']+ordered_clusters,'\t')+'\n') + else: + cluster = str(genes[t[0]]) + eo.write(string.join([t[0],cluster]+t[1:],'\t')+'\n') + rowNumber+=1 + eo.close() + print 'Merged-ICGS all-cells file exported to:',output + +def importMergedICGS(final_output_dir,outputDir,groups_db,CenterMethod): + """ Reimport the final ICGS centroids and genes for merging the all cell results """ + expression_db = {} + rowNumber = 1 + import collections + genes=collections.OrderedDict() + for line in open(final_output_dir,'rU').xreadlines(): + data = cleanUpLine(line) + t = 
string.split(data,'\t') + if rowNumber==1: + original_cluster_names = t[2:] + elif rowNumber==2: + clusters = t[2:] + else: + genes[t[0]]=t[1] ### Store the gene and cluster number + rowNumber+=1 + + index = 0 + cluster_db = collections.OrderedDict() + ordered_barcodes=[] + ordered_clusters=[] + names_added=[] + cluster_names=[] + for original_name in original_cluster_names: + names = string.split(original_name,'|') + cluster = clusters[index] + barcodes=[] + for name in names: + if name in names_added: + continue ### Looks like a bug or issue with how the data is loaded + names_added.append(name) + name = string.replace(name,'-'+CenterMethod,'') + ordered_barcodes+=groups_db[name] + ordered_clusters+=[cluster]*len(groups_db[name]) + cluster_names+=[original_name]*len(groups_db[name]) + cluster_db[cluster]=barcodes + index+=1 + + eo = export.ExportFile(outputDir+'/groups.all-cell-ICGS.txt') + i=0 + for b in ordered_barcodes: + eo.write(b+'\t'+ordered_clusters[i]+'\t'+cluster_names[i]+'\n') + i+=1 + eo.close() + + return genes, ordered_barcodes, ordered_clusters + +def exportSorted(filename, uids, sort_col=0, excludeHeader=True): + ### efficient method to sort a big file without storing everything in memory + ### http://stackoverflow.com/questions/7079473/sorting-large-text-data + ouput_file = filename[:-4]+'-sorted' ### temporary + index = [] + f = open(filename) + index_db={} + firstLine = True + while True: + offset = f.tell() + line = f.readline() + if not line: break + length = len(line) + col = line.split('\t')[sort_col].strip() + + if firstLine: + header = line + firstLine = False + if excludeHeader == False: + index.append((col, offset, length)) + else: + if col in uids: + #index.append((col, offset, length)) + index_db[col] = (col, offset, length) + f.close() + + for uid in uids: + if uid in index_db: + index.append(index_db[uid]) ### Order to seek to + + o = open(ouput_file,'w') + f = open(filename) + if excludeHeader: + o.write(header) + for col, offset, length in index: + #print col, offset, length + f.seek(offset) + o.write(f.read(length)) + o.close() + """ + try: + ### Error occurs when the file can't be deleted due to system permissions + os.remove(filename) + os.rename(ouput_file,filename) + return filename + except Exception: + return ouput_file + """ + print filename,'...sorted' + return ouput_file + +def retreive_groups_from_file(filename,groups_db): + root_dir = os.path.abspath(os.path.join(filename, os.pardir)) + root_dir = os.path.abspath(os.path.join(root_dir[:-1], os.pardir)) + dataset= os.path.basename(root_dir) + fn=filepath(filename) + expression_db = {} + rowNumber = 1 + for line in open(fn,'rU').xreadlines(): + data = cleanUpLine(line) + t = string.split(data,'\t') + if rowNumber==1: + header = t[2:] + import collections + clusters = collections.OrderedDict() + cluster_number=1 + for h in header: + cluster,barcode = string.split(h,':') + if cluster in clusters: + cn = clusters[cluster] + groups_db[str(cn)+'.'+dataset].append(barcode+'.'+dataset) + else: + clusters[cluster]=cluster_number + groups_db[str(cluster_number)+'.'+dataset]=[barcode+'.'+dataset] + cluster_number+=1 + firstRow = False + else: + break + rowNumber+=1 + + ### Verify that there is not redundancy + temp={} + for cluster in groups_db: + for barcode in groups_db[cluster]: + if barcode in temp: + print barcode, temp[barcode], cluster + print 'Warning!!!!! 
Redundant barcodes by merged';sys.exit() + temp[barcode]=cluster + + return groups_db + +def renameICGSfiles(ICGS_files,CenterMethod,ReturnAllCells=False): + """ Rename the input files """ + files_to_merge = [] + all_cells={} + import collections + groups_db = collections.OrderedDict() + for heatmap_file in ICGS_files: + root_dir = os.path.abspath(os.path.join(heatmap_file, os.pardir)) + if 'ICGS' in root_dir: + print 'Importing prior produced ICGS-results from:',root_dir + root_dir = os.path.abspath(os.path.join(root_dir[:-1], os.pardir)) + cellHarmonyReferenceFile = root_dir+'/CellHarmonyReference/MarkerFinder-cellHarmony-reference-centroid.txt' + cellHarmonyReferenceFileFull = root_dir+'/CellHarmonyReference/MarkerFinder-cellHarmony-reference.txt' + dataset= os.path.basename(root_dir) + src=cellHarmonyReferenceFile + dst=root_dir+'/CellHarmonyReference/'+dataset+'-'+CenterMethod+'.txt' + try: shutil.copyfile(src,dst) # Coment out if already run + except: pass + if ReturnAllCells == False: + files_to_merge.append(dst) -def createMetaICGSResults(ICGS_files,outputDir,CenterMethod='median',species='Hs',PearsonThreshold=0.9): + src=cellHarmonyReferenceFileFull + dst=root_dir+'/CellHarmonyReference/'+dataset+'.txt' + try: shutil.copyfile(src,dst) # Coment out if already run + except: pass + all_cells[dataset+'-'+CenterMethod]=dst + groups_db = retreive_groups_from_file(heatmap_file,groups_db) + if ReturnAllCells: + files_to_merge.append(dst) + + return files_to_merge, all_cells, groups_db + +def createMetaICGSResults(ICGS_files,outputDir,CenterMethod='median', + species='Hs',platform='RNASeq',PearsonThreshold=0.9,force=True): """This function combines ICGS or MarkerFinder results from multiple outputs""" - # import all ICGS or MarkerFinder variable genes - gene_db = simpleICGSGeneImport(ICGS_files) + #### If you only want to merge existing results, run this + #createMetaICGSAllCells(ICGS_files,outputDir,CenterMethod=CenterMethod, + # species=species,platform=platform,PearsonThreshold=PearsonThreshold) + #sys.exit() + # import raw expression values for each ICGS or MarkerFinder files_to_merge = [] + all_cells={} + # Check to see if the output already exist - for heatmap_file in ICGS_files: - ### returnCentroids = False to return all cells - cellHarmonyReferenceFile = convertICGSClustersToExpression(heatmap_file,heatmap_file,returnCentroids=True, - CenterMethod=CenterMethod,geneOverride=gene_db,combineFullDatasets=False,species=species) - files_to_merge.append(cellHarmonyReferenceFile) + # If not, create the MarkerFinder-cellHarmony-reference with the union of all genes + #if len(files_to_merge) != len(ICGS_files) or force==True: + if force==True: + + # import all ICGS or MarkerFinder variable genes + gene_db = simpleICGSGeneImport(ICGS_files) + print len(gene_db), 'unique genes from ICGS results imported...' 
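exportSorted above reorders a large expression file by gene ID without holding it in memory: one pass records each row's byte offset and length via tell(), a second pass seeks back to those offsets in the requested UID order. A minimal sketch of the same idea; reorder_rows is an illustrative name and a tab-delimited input with a single header row is assumed.

def reorder_rows(filename, ordered_uids, output_path):
    """Write the rows of a tab-delimited file in the order given by
    ordered_uids, using byte offsets instead of loading the file."""
    index = {}
    with open(filename) as f:
        header = f.readline()
        while True:
            offset = f.tell()
            line = f.readline()
            if not line:
                break
            uid = line.split('\t', 1)[0].strip()
            index[uid] = (offset, len(line))
    with open(output_path, 'w') as out, open(filename) as f:
        out.write(header)
        for uid in ordered_uids:
            if uid in index:
                offset, length = index[uid]
                f.seek(offset)          # jump straight to the stored row
                out.write(f.read(length))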
+ for heatmap_file in ICGS_files: + ### returnCentroids = False to return all cells + cellHarmonyReferenceFile = convertICGSClustersToExpression(heatmap_file,heatmap_file,returnCentroids=True, + CenterMethod=CenterMethod,geneOverride=gene_db,combineFullDatasets=False,species=species) + files_to_merge.append(cellHarmonyReferenceFile) + + """ Rename the Expanded CellHarmonyReference files based on the dataset """ + files_to_merge, all_cells, groups_db = renameICGSfiles(ICGS_files,CenterMethod) join_option = 'Intersection' ID_option = False ### Require unique-matches only when True @@ -2909,42 +3704,98 @@ def createMetaICGSResults(ICGS_files,outputDir,CenterMethod='median',species='Hs outputfile = mergeFiles.joinFiles(files_to_merge, join_option, ID_option, output_merge_dir) # collapse similar medoids into centroids to remove redundant references - query_output_file = collapseSimilarMedoids(outputfile,cutoff=PearsonThreshold) - + query_output_file, unclustered_collapsed = collapseSimilarMedoids(outputfile,cutoff=PearsonThreshold) + # re-cluster this merged file with HOPACH to produce the final combined medoid reference from visualization_scripts import clustering - row_method = 'hopach'; row_metric = 'correlation'; column_method = 'hopach'; column_metric = 'cosine'; color_gradient = 'yellow_black_blue' + row_method = None; row_metric = 'correlation'; column_method = None; column_metric = 'cosine'; color_gradient = 'yellow_black_blue' transpose = False; Normalize=False + graphics = clustering.runHCexplicit(query_output_file, [], row_method, row_metric, column_method, column_metric, color_gradient, transpose, Normalize=Normalize, contrast=3, display=False) - + print 'Completed clustering' revised_cellHarmony_reference = graphics[-1][-1][:-4]+'.txt' final_output_dir = outputDir+'/CellHarmonyReference/ICGS-merged-reference.txt' - exportMergedReference(revised_cellHarmony_reference,final_output_dir) + exportMergedReference(unclustered_collapsed,revised_cellHarmony_reference,final_output_dir,outputDir,species,platform) + + createMetaICGSAllCells(ICGS_files,outputDir,CenterMethod=CenterMethod, + species=species,platform=platform,PearsonThreshold=PearsonThreshold) return final_output_dir -def exportMergedReference(input,output): +def exportMergedReference(unclustered_centroids,input,output,outputDir,species,platform): + import UI + fl = UI.ExpressionFileLocationData('','','','') + fl.setOutputDir(outputDir) + fl.setSpecies(species) + fl.setPlatformType(platform) + fl.setRPKMThreshold(0.00) + fl.setCorrelationDirection('up') + compendiumType = 'protein_coding' + genesToReport = 50 + correlateAll = True + import markerFinder + print [unclustered_centroids] + print [input] + print 'Running MarkerFinder' + markerFinder.analyzeData(unclustered_centroids,species,platform,compendiumType, + geneToReport=genesToReport,correlateAll=correlateAll,AdditionalParameters=fl, + logTransform=False) + + markerfinder_dir= outputDir+'CellHarmonyReference/MarkerFinder/AllGenes_correlations-ReplicateBased.txt' + import collections + marker_db = collections.OrderedDict() + + def importMarkerFinderResults(input_file): + header_row=True + for line in open(input_file,'rU').xreadlines(): + data = cleanUpLine(line) + data = string.replace(data,'"','') + geneID, symbol, rho, p, cell_state = string.split(data,'\t') + if header_row: + header_row=False + else: + rho = float(rho) + if cell_state in marker_db: + if len(marker_db[cell_state])>49: + continue + elif rho>0.2: + marker_db[cell_state].append(geneID) + else: + 
marker_db[cell_state] = [geneID] + + importMarkerFinderResults(markerfinder_dir) + row_count=0 eo = export.ExportFile(output) - eo.write(string.join([],'\t')+'') + clustered_genes={} for line in open(input,'rU').xreadlines(): row_count+=1 data = cleanUpLine(line) t = string.split(data,'\t') if row_count==1: clusters = len(t)-1 - eo.write(string.join(['UID','row_clusters-flat']+map(str,range(1,clusters)),'\t')+'\n') + #eo.write(string.join(['UID','row_clusters-flat']+map(str,range(1,clusters)),'\t')+'\n') + headers = t[2:] + eo.write(line) elif row_count==2: clusters = len(t)-1 eo.write(string.join(['column_clusters-flat','']+map(str,range(1,clusters)),'\t')+'\n') else: - eo.write(line) + #eo.write(line) + clustered_genes[t[0]]=t[2:] + + clusters=1 + for cell_state in headers: + if cell_state in marker_db: + for gene in marker_db[cell_state]: + eo.write(string.join([gene,str(clusters)]+clustered_genes[gene],'\t')+'\n') + clusters+=1 eo.close() def collapseSimilarMedoids(outputfile,cutoff=0.9): - print 'correlation cutoff =',cutoff + print 'correlation cutoff =',[cutoff] from visualization_scripts import clustering import numpy from stats_scripts import statistics @@ -3032,8 +3883,22 @@ def collapseSimilarMedoids(outputfile,cutoff=0.9): print group_index_db[cluster];sys.exit() eo.write(string.join([row_header[i]]+avg_matrix,'\t')+'\n') i+=1 - - return collapsed_dir + eo.close() + + # Copy to a file for MarkerFinder analysis + unclustered_collapsed = string.replace(collapsed_dir,'collapsedMedoids.txt','exp.unclustered.txt') + try: shutil.copyfile(collapsed_dir,unclustered_collapsed) + except: pass + + eo = export.ExportFile(string.replace(unclustered_collapsed,'exp.','groups.')) + number=1 + for cluster in group_index_db: + eo.write(cluster+'\t'+str(number)+'\t'+cluster+'\n') + number+=1 + eo.close() + eo = export.ExportFile(string.replace(unclustered_collapsed,'exp.','comps.')) + eo.close() + return collapsed_dir, unclustered_collapsed def convertICGSClustersToExpression(heatmap_file,query_exp_file,returnCentroids=False, CenterMethod='median',geneOverride=None,combineFullDatasets=True,species='Hs'): @@ -3292,7 +4157,7 @@ def convertICGSClustersToExpression(heatmap_file,query_exp_file,returnCentroids= print 'Using invidual cells rather than cell centroids for alignment.' 
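importMarkerFinderResults above trims the MarkerFinder correlation table to at most 50 genes per cell state, and once a state has its first entry it only accepts further genes with rho > 0.2. A compact sketch of that filter; top_markers is an illustrative name and the five-column layout (geneID, symbol, rho, p, cell state) mirrors the AllGenes_correlations-ReplicateBased.txt file read above.

import collections

def top_markers(correlation_file, max_genes=50, min_rho=0.2):
    """Return an ordered {cell_state: [geneID, ...]} from a MarkerFinder
    correlation file, capped at max_genes per state."""
    marker_db = collections.OrderedDict()
    header = True
    for line in open(correlation_file):
        if header:
            header = False
            continue
        fields = line.rstrip().replace('"', '').split('\t')
        gene_id, symbol, rho, p, cell_state = fields[:5]
        genes = marker_db.setdefault(cell_state, [])
        if len(genes) >= max_genes:
            continue
        if not genes or float(rho) > min_rho:
            genes.append(gene_id)
    return marker_db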
return cellHarmonyReferenceFile -def simpleExpressionFileImport(filename): +def simpleExpressionFileImport(filename,filterUID={}): fn=filepath(filename) expression_db = {} firstRow = True @@ -3303,6 +4168,9 @@ def simpleExpressionFileImport(filename): header = t[1:] firstRow = False else: + if len(filterUID)>0: + if t[0] not in filterUID: + continue expression_db[t[0]] = t[1:] return expression_db, header @@ -3371,6 +4239,57 @@ def compareICGSpopulationFrequency(folder): ea.close() if __name__ == '__main__': + import UI + + folds_file = '/Users/saljh8/Desktop/DemoData/cellHarmony/Mouse_BoneMarrow/inputFile/cellHarmony/exp.ICGS-cellHarmony-reference__AML-AllCells-folds.txt' + output = '/data/salomonis2/LabFiles/TabulaMuris/10x-GSE109774_RAW/all/cellHarmony/' + output_file = '/data/salomonis2/LabFiles/TabulaMuris/10x-GSE109774_RAW/all/cellHarmony/exp.MarkerFinder-cellHarmony-reference__12_tissues-CellOntologyCommon-ReOrdered.txt' + species = 'Mm' + ### Build-UMAP plot + import UI + import warnings + warnings.filterwarnings('ignore') + try: + try: os.mkdir(output+'/UMAP-plots') + except: pass + """ Output UMAP combined plot colored by reference and query cell identity """ + plot = UI.performPCA(output_file, 'no', 'UMAP', False, None, plotType='2D', + display=False, geneSetName=None, species=species, zscore=True, reimportModelScores=False, + separateGenePlots=False, returnImageLoc=True) + plot = plot[-1][-1][:-4]+'.pdf' + shutil.copy(plot,output+'/UMAP-plots/UMAP-query-vs-ref.pdf') + + """ Output UMAP combined plot colored by cell tates """ + plot = UI.performPCA(output_file, 'no', 'UMAP', False, None, plotType='2D', + display=False, geneSetName=None, species='Mm', zscore=True, reimportModelScores=True, + separateGenePlots=False, returnImageLoc=True, forceClusters=True) + plot = plot[-1][-1][:-4]+'.pdf' + shutil.copy(plot,output+'/UMAP-plots/UMAP-query-vs-ref-clusters.pdf') + + """ Output individual UMAP plots colored by cell tates """ + groups_file = string.replace(output_file,'exp.','groups.') + plots = UI.performPCA(output_file, 'no', 'UMAP', False, None, plotType='2D', + display=False, geneSetName=None, species='Mm', zscore=True, reimportModelScores=True, + separateGenePlots=False, returnImageLoc=True, forceClusters=True, maskGroups=groups_file) + for plot in plots: + plot = plot[-1][:-4]+'.pdf' + + if '-cellHarmony-Reference-' in plot: + shutil.copy(plot,output+'/UMAP-plots/UMAP-ref-clusters.pdf') + else: + shutil.copy(plot,output+'/UMAP-plots/UMAP-query-clusters.pdf') + except ZeroDivisionError: + pass + sys.exit() + fl = UI.ExpressionFileLocationData(folds_file,'','',''); species='Mm'; platform = 'RNASeq' + fl.setSpecies(species); fl.setVendor(platform) + fl.setOutputDir(output) + DEGs_combined = aggregateRegulatedGenes('/Users/saljh8/Desktop/DemoData/cellHarmony/Mouse_BoneMarrow/inputFile/cellHarmony/DifferentialExpression_Fold_2.0_adjp_0.05') + clustered_groups_file = findSimilarImpactedCellStates(folds_file,DEGs_combined) + sys.exit() + exportPvalueRankedGenes(species,platform,fl,folds_file,DEGs_combined) + sys.exit() + icgs_dir = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/ICGS/Clustering-exp.NaturePanorma-Ly6G-Guide3-Augment-F2r-hierarchical_cosine_correlation.txt' exp_dir = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/ExpressionInput/exp.Guide3-cellHarmony-revised.txt' #convertICGSClustersToExpression(icgs_dir,exp_dir);sys.exit() @@ -3379,6 +4298,31 @@ def 
compareICGSpopulationFrequency(folder): reference_exp_file = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/cellHarmony-evaluation/Grimes/exp.WT.txt' query_exp_file = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/cellHarmony-evaluation/Grimes/exp.AML.txt' classification_file= '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/cellHarmony-evaluation/Grimes/CellClassification/AML-CellClassification.txt' + folds_file = '/Users/saljh8/Desktop/DemoData/cellHarmony/Mouse_BoneMarrow/inputFile/cellHarmony/exp.ICGS-cellHarmony-reference__AML-AllCells-folds.txt' + output = '/Users/saljh8/Desktop/DemoData/cellHarmony/Mouse_BoneMarrow/inputFile/cellHarmony/' + species = 'Mm' + array_type = 'RNASeq' + parent_dir = output+'/DifferentialExpression_Fold_1.5_rawp_0.05/' + dir_list = unique.read_directory(parent_dir) + import UI + for file in dir_list: + input_file_dir = parent_dir+'/'+file + inputType = 'IDs' + interactionDirs = ['WikiPathways','KEGG','BioGRID','TFTargets'] + output_dir = parent_dir + degrees = 'direct' + input_exp_file = input_file_dir + gsp = UI.GeneSelectionParameters(species,array_type,array_type) + gsp.setGeneSet('None Selected') + gsp.setPathwaySelect('') + gsp.setGeneSelection('') + gsp.setOntologyID('') + gsp.setIncludeExpIDs(True) + UI.networkBuilder(input_file_dir,inputType,output_dir,interactionDirs,degrees,input_exp_file,gsp,'') + sys.exit() + DEGs_combined = aggregateRegulatedGenes(output+'/DifferentialExpression_Fold_1.5_rawp_0.05') + exportCombinedMarkerFinderResults(folds_file,output+'/MarkerFinder-positive',output+'/MarkerFinder-negative',DEGs_combined,species) + sys.exit() harmonizeClassifiedSamples('Hs',reference_exp_file,query_exp_file,classification_file);sys.exit() #modelScores('/Users/saljh8/Desktop/dataAnalysis/LineageProfiler/Training/CellClassification');sys.exit() diff --git a/RNASeq.py b/RNASeq.py index 657cd71..4f53058 100755 --- a/RNASeq.py +++ b/RNASeq.py @@ -2789,13 +2789,16 @@ def importBiologicalRelationships(species): except Exception: pass return custom_annotation_dbase -def importGeneSets(geneSetType,filterType=None,geneAnnotations=None): +def importGeneSets(geneSetType,filterType=None,geneAnnotations=None,speciesName=None): + try: speciesName = species + except: pass + gene_db={} if 'Ontology' in geneSetType: - filename = 'AltDatabase/goelite/'+species+'/nested/Ensembl_to_Nested-GO.txt' + filename = 'AltDatabase/goelite/'+speciesName+'/nested/Ensembl_to_Nested-GO.txt' ontology=True else: - filename = 'AltDatabase/goelite/'+species+'/gene-mapp/Ensembl-'+geneSetType+'.txt' + filename = 'AltDatabase/goelite/'+speciesName+'/gene-mapp/Ensembl-'+geneSetType+'.txt' ontology=False fn=filepath(filename) for line in open(fn,'rU').xreadlines(): @@ -2813,7 +2816,7 @@ def importGeneSets(geneSetType,filterType=None,geneAnnotations=None): gene_db[gene]=[] return gene_db -def singleCellRNASeqWorkflow(Species, platform, expFile, mlp, exp_threshold=5, rpkm_threshold=5, drivers=False, parameters = None, reportOnly=False): +def singleCellRNASeqWorkflow(Species, platform, expFile, mlp, exp_threshold=0, rpkm_threshold=5, drivers=False, parameters = None, reportOnly=False): global species global rho_cutoff species = Species diff --git a/UI.py b/UI.py index 5c9bf64..c6d6a1f 100755 --- a/UI.py +++ b/UI.py @@ -514,7 +514,7 @@ def preProcessRNASeq(species,exp_file_location_db,dataset,mlp_instance,root): genome = export.findFilename(matrix_dir[:-1]) parent_dir = export.findParentDir(matrix_dir[:-1]) from import_scripts import ChromiumProcessing - 
ChromiumProcessing.import10XSparseMatrix(parent_dir,genome,dataset,expFile=expFile) + ChromiumProcessing.import10XSparseMatrix(matrix_file,genome,dataset,expFile=expFile) except Exception: print 'Chromium export failed due to:',traceback.format_exc() try: root.destroy() @@ -1041,12 +1041,40 @@ def runLineageProfiler(fl, expr_input_dir, vendor, custom_markerFinder, geneMode print '****Running cellHarmony****' codingtype = 'exon'; compendium_platform = 'exon' platform = array_type,vendor - try: LineageProfilerIterate.runLineageProfiler(species,platform,expr_input_dir,expr_input_dir,codingtype,compendium_platform,customMarkers=custom_markerFinder,geneModels=geneModel,modelSize=modelSize,fl=fl) - except Exception: - print_out = traceback.format_exc() - try: InfoWindow(print_out, 'Continue') ### Causes an error when peforming heatmap visualizaiton - except Exception: None - print_out = 'LineageProfiler classification results saved to the folder "CellClassification".' + + try: returnCentroids = fl.ReturnCentroids() + except Exception: returnCentroids = 'community' + + if returnCentroids == 'community': + """ Match cells between datasets according to their similar clusters defined by + community clustering of the two independent dataset network (same ICGS genes) """ + from stats_scripts import cellHarmony + try: output_dir = fl.OutputDir() + except: + output_dir = os.path.abspath(os.path.join(expr_input_dir, os.pardir)) + if 'ExpressionInput' in output_dir: + output_dir = string.replace(output_dir,'ExpressionInput','') + + try: + print 'Running community cellHarmony analysis' + cellHarmony.manage_louvain_alignment(species,platform,expr_input_dir,output_dir, + customMarkers=custom_markerFinder,useMulti=False,fl=fl) + except ZeroDivisionError: + print_out = traceback.format_exc() + try: InfoWindow(print_out, 'Continue') ### Causes an error when peforming heatmap visualizaiton + except Exception: None + else: + """ Directly match cells between datasets either all cells by all cells or cells + by centroids (same ICGS genes) """ + try: LineageProfilerIterate.runLineageProfiler(species,platform,expr_input_dir,expr_input_dir, + codingtype,compendium_platform,customMarkers=custom_markerFinder, + geneModels=geneModel,modelSize=modelSize,fl=fl) + except Exception: + print_out = traceback.format_exc() + try: InfoWindow(print_out, 'Continue') ### Causes an error when peforming heatmap visualizaiton + except Exception: None + + print_out = 'cellHarmony classification results saved to the folder "CellClassification".' 
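The runLineageProfiler change above chooses between two alignment strategies based on the reference type stored on the file-location object: 'community' routes to Louvain community-based alignment, anything else falls through to direct correlation-based classification. A minimal sketch of that dispatch; choose_alignment, align_by_community and align_by_correlation are hypothetical stand-ins for the calls to cellHarmony.manage_louvain_alignment and LineageProfilerIterate.runLineageProfiler shown above.

def choose_alignment(fl, align_by_community, align_by_correlation):
    """Pick the alignment routine from fl.ReturnCentroids():
    'community' -> graph/community alignment; True or False
    (centroid or per-cell reference) -> direct correlation."""
    try:
        reference_type = fl.ReturnCentroids()
    except Exception:
        reference_type = 'community'   # default mirrors the code above
    if reference_type == 'community':
        return align_by_community
    return align_by_correlation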
if root!=None and root!='': try: openDirectory(export.findParentDir(expr_input_dir)+'/cellHarmony') except Exception: None @@ -1061,7 +1089,7 @@ def runLineageProfiler(fl, expr_input_dir, vendor, custom_markerFinder, geneMode def performPCA(filename, pca_labels, pca_algorithm, transpose, root, plotType='3D',display=True, geneSetName=None, species=None, zscore=True, colorByGene=None, reimportModelScores=True, - separateGenePlots=False, returnImageLoc=False): + separateGenePlots=False, returnImageLoc=False, forceClusters=False, maskGroups=None): from visualization_scripts import clustering; reload(clustering) graphics = [] if pca_labels=='yes' or pca_labels=='true'or pca_labels=='TRUE': pca_labels=True @@ -1073,7 +1101,7 @@ def performPCA(filename, pca_labels, pca_algorithm, transpose, root, plotType='3 pca_graphical_links = clustering.runPCAonly(filename, graphics, transpose, showLabels=pca_labels, plotType=plotType,display=display, algorithm=pca_algorithm, geneSetName=geneSetName, species=species, zscore=zscore, colorByGene=colorByGene, reimportModelScores=reimportModelScores, - separateGenePlots=separateGenePlots) + separateGenePlots=separateGenePlots, forceClusters=forceClusters, maskGroups=maskGroups) try: print'Finished building exporting plot.' except Exception: None ### Windows issue with the Tk status window stalling after pylab.show is called except Exception: @@ -4455,13 +4483,19 @@ def UseAdjPvalue(self): return False else: return True + def setLabels(self,labels): self.labels = labels + def Labels(self): return self.labels def setFoldCutoff(self, foldCutoff): self.foldCutoff = foldCutoff def FoldCutoff(self): return self.foldCutoff def setPvalThreshold(self, pvalThreshold): self.pvalThreshold = pvalThreshold def PvalThreshold(self): return self.pvalThreshold def setPeformDiffExpAnalysis(self, peformDiffExpAnalysis): self.peformDiffExpAnalysis = peformDiffExpAnalysis def PeformDiffExpAnalysis(self): - if string.lower(self.peformDiffExpAnalysis)=='false' or string.lower(self.peformDiffExpAnalysis)=='no' or self.peformDiffExpAnalysis==False: + if self.peformDiffExpAnalysis==False: + return False + if self.peformDiffExpAnalysis==True: + return True + if string.lower(self.peformDiffExpAnalysis)=='false' or string.lower(self.peformDiffExpAnalysis)=='no': return False else: return True @@ -4797,6 +4831,8 @@ def checkForLocalArraySupport(species,array_type,specific_arraytype,run_mode): if array_type == 'junction' or array_type == 'RNASeq': try: gene_database = unique.getCurrentGeneDatabaseVersion() except Exception: gene_database='00' + try: int(gene_database[-2:]) + except: gene_database='00' if int(gene_database[-2:]) < 0: print_out = 'The AltAnalyze database indicated for '+array_type+' analysis\n is not supported for alternative exon analysis.\nPlease update to EnsMart55 or greater before\nproceeding.' 
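PeformDiffExpAnalysis above now accepts both real booleans and 'yes'/'no'/'true'/'false' strings, checking the boolean cases first so a lower-case comparison is never attempted on a bool. The same guard factored into a small helper; to_flag is a hypothetical name, not part of the patch.

def to_flag(value, default=True):
    """Normalize an option that may arrive as a bool or as a yes/no string."""
    if value is True or value is False:
        return value
    try:
        return value.lower() not in ('false', 'no')
    except AttributeError:
        return default

# to_flag(True) -> True, to_flag('no') -> False, to_flag('yes') -> True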
if run_mode == 'GUI': IndicatorWindow(print_out,'Continue') @@ -5659,6 +5695,7 @@ def rebootAltAnalyzeGUI(selected_parameters,user_variables): useAdjPval = gu.Results()['UseAdjPval'] pvalThreshold = gu.Results()['pvalThreshold'] foldCutoff = gu.Results()['FoldCutoff'] + labels = gu.Results()['labels'] if '.png' in markerFinder_file or '.pdf' in markerFinder_file: markerFinder_file=markerFinder_file[:-4]+'.txt' if len(geneModel_file) == 0: geneModel_file = None @@ -5675,6 +5712,7 @@ def rebootAltAnalyzeGUI(selected_parameters,user_variables): fl.setUseAdjPvalue(useAdjPval) fl.setPvalThreshold(pvalThreshold) fl.setFoldCutoff(foldCutoff) + fl.setLabels(labels) """ print fl.PeformDiffExpAnalysis() print fl.CompendiumType() @@ -5793,7 +5831,22 @@ def import10XSparseMatrixHeaders(matrix_file): barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")] barcodes = map(lambda x: string.replace(x,'-1',''), barcodes) return barcodes - barcodes = import10XSparseMatrixHeaders(sparse_matrix_file) + def importH5(h5_filename): + import h5py + f = h5py.File(h5_filename, 'r') + possible_genomes = f.keys() + if len(possible_genomes) != 1: + raise Exception("{} contains multiple genomes ({}). Explicitly select one".format(h5_filename, ", ".join(possible_genomes))) + genome = possible_genomes[0] + barcodes = f[genome]['barcodes'] + barcodes = map(lambda x: string.replace(x,'-1',''), barcodes) + print len(barcodes);sys.exit() + return barcodes + + if '.mtx' in sparse_matrix_file: + barcodes = import10XSparseMatrixHeaders(sparse_matrix_file) + else: + barcodes = importH5(sparse_matrix_file) cel_files = barcodes else: cel_files,cel_files_fn=identifyCELfiles(cel_file_dir,array_type,vendor) @@ -6507,16 +6560,20 @@ def import10XSparseMatrixHeaders(matrix_file): batch = ''; batch_name = '' agd = ArrayGroupData(cel_file,batch,batch_name); array_batch_list.append(agd); batch_db=[] if comps_name in dir_files and len(group_db)>0: - comp_group_list, null = ExpressionBuilder.importComparisonGroups(comps_file_dir) - for group1,group2 in comp_group_list: - try: - group_name1 = group_db[int(group1)]; group_name2 = group_db[int(group2)] - original_comp_group_list.append((group_name1,group_name2)) ### If comparisons already exist, default to these - except KeyError: - print_out = 'The "comps." file for this dataset has group numbers\nnot listed in the "groups." file.' - #WarningWindow(print_out,'Exit'); AltAnalyze.AltAnalyzeSetup('no'); sys.exit() - #print print_out + try: + comp_group_list, null = ExpressionBuilder.importComparisonGroups(comps_file_dir) + for group1,group2 in comp_group_list: + try: + group_name1 = group_db[int(group1)]; group_name2 = group_db[int(group2)] + original_comp_group_list.append((group_name1,group_name2)) ### If comparisons already exist, default to these + except KeyError: + print_out = 'The "comps." file for this dataset has group numbers\nnot listed in the "groups." file.' + #WarningWindow(print_out,'Exit'); AltAnalyze.AltAnalyzeSetup('no'); sys.exit() + #print print_out original_comp_group_list=[] + except: + print_out = 'The "comps." file for this dataset has group numbers\nnot listed in the "groups." file.' 
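importH5 above pulls cell barcodes straight from a 10x Genomics HDF5 matrix: the file carries one top-level group per genome, and the 'barcodes' dataset sits under it. A self-contained sketch of the same read (read_h5_barcodes is an illustrative name; the h5py package is assumed to be installed), with the trailing '-1' GEM-well suffix stripped as the code above does.

import h5py

def read_h5_barcodes(h5_filename):
    """Return the barcode list from a CellRanger .h5 matrix with a single genome group."""
    with h5py.File(h5_filename, 'r') as f:
        genomes = list(f.keys())
        if len(genomes) != 1:
            raise Exception('%s contains multiple genomes (%s). Explicitly select one'
                            % (h5_filename, ', '.join(genomes)))
        genome = genomes[0]
        barcodes = [b.decode() if isinstance(b, bytes) else b
                    for b in f[genome]['barcodes'][:]]
    return [b.replace('-1', '') for b in barcodes]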
+ original_comp_group_list=[] else: for cel_file in cel_files: @@ -7017,7 +7074,7 @@ def Normalize(self): return self._Normalize def setDownsample(self,downsample): self.downsample = downsample def DownSample(self): try: - return int(self.downsample()) + return int(self.downsample) except: return 2500 def setK(self,k): self.k = k diff --git a/import_scripts/ChromiumProcessing.py b/import_scripts/ChromiumProcessing.py index e44ee78..875c2b1 100755 --- a/import_scripts/ChromiumProcessing.py +++ b/import_scripts/ChromiumProcessing.py @@ -5,19 +5,45 @@ import numpy import time import math +from scipy import sparse, stats +import h5py def import10XSparseMatrix(matrices_dir,genome,dataset_name, expFile=None, log=True): start_time = time.time() - human_matrix_dir = os.path.join(matrices_dir, genome) - mat = scipy.io.mmread(os.path.join(human_matrix_dir, "matrix.mtx")) - genes_path = os.path.join(human_matrix_dir, "genes.tsv") - if os.path.isfile(genes_path)==False: - genes_path = os.path.join(human_matrix_dir, "features.tsv") - gene_ids = [row[0] for row in csv.reader(open(genes_path), delimiter="\t")] - gene_names = [row[1] for row in csv.reader(open(genes_path), delimiter="\t")] - barcodes_path = os.path.join(human_matrix_dir, "barcodes.tsv") - barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")] - barcodes = map(lambda x: string.replace(x,'-1',''), barcodes) + h5_filename = matrices_dir + if '.h5' in matrices_dir: + f = h5py.File(h5_filename, 'r') + genome = None + matrices_dir = os.path.abspath(os.path.join(h5_filename, os.pardir)) + if genome == None: + possible_genomes = f.keys() + if len(possible_genomes) != 1: + raise Exception("{} contains multiple genomes ({}). Explicitly select one".format(h5_filename, ", ".join(possible_genomes))) + genome = possible_genomes[0] + #print("Auto-selecting genome {}".format(genome), file=sys.stderr) + + mat = sparse.csc_matrix((f[genome]['data'], f[genome]['indices'], f[genome]['indptr'])) + gene_names = f[genome]['gene_names'] + barcodes = list(f[genome]['barcodes']) + gene_ids = list(f[genome]['genes']) + else: + #matrix_dir = os.path.join(matrices_dir, genome) + matrix_dir = matrices_dir + mat = scipy.io.mmread(matrix_dir) + genes_path = string.replace(matrix_dir,'matrix.mtx','genes.tsv') + barcodes_path = string.replace(matrix_dir,'matrix.mtx','barcodes.tsv') + if os.path.isfile(genes_path)==False: + genes_path = string.replace(matrix_dir,'matrix.mtx','features.tsv') + if '.gz' in genes_path: + gene_ids = [row[0] for row in csv.reader(gzip.open(genes_path), delimiter="\t")] + gene_names = [row[1] for row in csv.reader(gzip.open(genes_path), delimiter="\t")] + barcodes = [row[0] for row in csv.reader(gzip.open(barcodes_path), delimiter="\t")] + else: + gene_ids = [row[0] for row in csv.reader(open(genes_path), delimiter="\t")] + gene_names = [row[1] for row in csv.reader(open(genes_path), delimiter="\t")] + barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")] + barcodes = map(lambda x: string.replace(x,'-1',''), barcodes) + matrices_dir = os.path.abspath(os.path.join(matrices_dir, os.pardir)) ### Write out raw data matrix counts_path = matrices_dir+'/'+dataset_name+'_matrix.txt' diff --git a/import_scripts/TableToMatrix.py b/import_scripts/TableToMatrix.py new file mode 100644 index 0000000..b134e53 --- /dev/null +++ b/import_scripts/TableToMatrix.py @@ -0,0 +1,112 @@ +import os, sys, string +from scipy import sparse, io +import numpy +import math +import time + +""" Converts a tab-delimited text file 
to a sparse matrix """ + +class SparseMatrix: + def __init__(self,barcodes,features,data_matrix): + self.barcodes = barcodes + self.features = features + self.data_matrix = data_matrix + def Barcodes(self): return self.barcodes + def Features(self): return self.features + def Matrix(self): return self.data_matrix + +def import_filter_genes(fn): + gene_filter = [] + for line in open(fn,'rU').xreadlines(): + if '\t' in line: + gene = string.split(line.rstrip(),'\t')[0] + else: + gene = string.split(line.rstrip(),',')[0] + gene_filter.append(gene) + return gene_filter + +def covert_table_to_matrix(fn,delimiter='\t',gene_filter=None,Export=True): + header=True + skip=False + start_time = time.time() + for line in open(fn,'rU').xreadlines(): + if header: + delimiter = ',' # CSV file + start = 1 + if 'row_clusters' in line: + start=2 # An extra column and row are present from the ICGS file + skip=True + if '\t' in line: + delimiter = '\t' # TSV file + + t = string.split(line.rstrip(),delimiter)[start:] + + """ Optionally write out the matrix """ + if Export: + export_directory = os.path.abspath(os.path.join(fn, os.pardir))+'/sparse/' + try: os.mkdir(export_directory) + except: pass + barcodes_export = open(export_directory+'barcodes.tsv', 'w') + features_export = open(export_directory+'features.tsv', 'w') + matrix_export = open(export_directory+'matrix.mtx', 'w') + barcodes = t + barcodes_export.write(string.join(t,'\n')) + barcodes_export.close() + genes=[] + data_array=[] + header=False + elif skip: + skip=False # Igore the second row in the file that has cluster info + else: + values = string.split(line.rstrip(),'\t') + gene = values[0] + if gene_filter!=None: + """ Exclude the gene from the large input matrix if not in the filter list """ + if gene not in gene_filter: + continue + if ' ' in gene: + gene = string.split(gene,' ')[0] + if ':' in gene: + genes.append((gene.rstrip().split(':'))[1]) + else: + genes.append(gene) + """ If the data is a float, increment by 0.5 to round up """ + values = map(float,values[start:]) + def convert(x): + if x==0: + return 0 + else: + return int(math.pow(2,x)-1) + values = map(lambda x: convert(x),values) + data_array.append(values) + + data_array = sparse.csr_matrix(numpy.array(data_array)) + + end_time = time.time() + print 'Sparse matrix conversion in',end_time-start_time,'seconds.' + + if Export: + features_export.write(string.join(genes,'\n')) + features_export.close() + io.mmwrite(matrix_export, data_array) + else: + sm = SparseMatrix(barcodes,genes,data_array) + return sm + +if __name__ == '__main__': + import getopt + + gene_filter = None + ################ Comand-line arguments ################ + if len(sys.argv[1:])<=1: ### Indicates that there are insufficient number of command-line arguments + sys.exit() + else: + options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','f=']) + for opt, arg in options: + if opt == '--i': fn=arg + elif opt == '--f': gene_filter=arg + else: + print "Warning! Command-line argument: %s not recognized. Exiting..." 
% opt; sys.exit() + if gene_filter != None: + gene_filter = import_filter_genes(gene_filter) + covert_table_to_matrix(fn,gene_filter=gene_filter) \ No newline at end of file diff --git a/import_scripts/combineCSV.py b/import_scripts/combineCSV.py new file mode 100644 index 0000000..fa4b6df --- /dev/null +++ b/import_scripts/combineCSV.py @@ -0,0 +1,77 @@ +import sys,string,os,math +sys.path.insert(1, os.path.join(sys.path[0], '..')) ### import parent dir dependencies + +def getFiles(sub_dir): + dir_list = os.listdir(sub_dir); dir_list2 = [] + for entry in dir_list: + if '.csv' in entry: dir_list2.append(entry) + return dir_list2 + +def cleanUpLine(line): + line = string.replace(line,'\n','') + line = string.replace(line,'\c','') + data = string.replace(line,'\r','') + data = string.replace(data,'"','') + return data + +def combineCSVFiles(root_dir): + files = getFiles(root_dir) + import export + folder = export.getParentDir(root_dir+'/blah.txt') + first_file = True + genes=[] + cells=[] + data_matrices=[] + for file in files: + cells.append(file[:-4]) + matrix=[] + for line in open(root_dir+'/'+file,'rU').xreadlines(): + data = cleanUpLine(line) + gene,count = string.split(data,',') + if first_file: + genes.append(gene) + matrix.append(float(count)) + data_matrices.append(matrix) + first_file=False + + data_matrices = zip(*data_matrices) + export_object = open(root_dir+'/'+folder+'-counts.txt','w') + headers = string.join(['UID']+cells,'\t')+'\n' + export_object.write(headers) + index=0 + for geneID in genes: + values = string.join([geneID]+map(str,list(data_matrices[index])),'\t')+'\n' + export_object.write(values) + index+=1 + export_object.close() + + data_matrices = zip(*data_matrices) + + index=0 + for cell in cells: + matrix = data_matrices[index] + barcode_sum = sum(matrix) + data_matrices[index] = map(lambda val: math.log((10000.00*val/barcode_sum)+1.0,2), matrix) + index+=1 + + data_matrices = zip(*data_matrices) + + export_object = open(root_dir+'/'+folder+'-CPTT.txt','w') + headers = string.join(['UID']+cells,'\t')+'\n' + export_object.write(headers) + index=0 + for geneID in genes: + values = string.join([geneID]+map(str,list(data_matrices[index])),'\t')+'\n' + export_object.write(values) + index+=1 + export_object.close() + +if __name__ == '__main__': + ################ Comand-line arguments ################ + import getopt + options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','GeneType=', 'IDType=']) + for opt, arg in options: + if opt == '--i': input_dir=arg + + combineCSVFiles(input_dir) + diff --git a/import_scripts/mergeFiles.py b/import_scripts/mergeFiles.py index 350752d..41fb809 100755 --- a/import_scripts/mergeFiles.py +++ b/import_scripts/mergeFiles.py @@ -286,6 +286,7 @@ def joinFiles(files_to_merge,CombineType,unique_join,outputDir): output_dir = outputDir combine_type = string.lower(CombineType) permform_all_pairwise = 'yes' + permform_all_pairwise = 'no' ### Uses only one reference gene ID print 'combine type:',combine_type print 'join type:', unique_join diff --git a/import_scripts/sampleIndexSelection.py b/import_scripts/sampleIndexSelection.py index 83990dd..b273cc9 100755 --- a/import_scripts/sampleIndexSelection.py +++ b/import_scripts/sampleIndexSelection.py @@ -38,7 +38,8 @@ def filterFile(input_file,output_file,filter_names,force=False,calculateCentroid for f in filter_names: if f in values: filter_names2.append(f) filter_names = filter_names2 - try: sample_index_list = map(lambda x: values.index(x), filter_names) + try: + sample_index_list = 
map(lambda x: values.index(x), filter_names) except: ### If ":" in header name if ':' in line: @@ -49,11 +50,24 @@ def filterFile(input_file,output_file,filter_names,force=False,calculateCentroid values2.append(x) values = values2 sample_index_list = map(lambda x: values.index(x), filter_names) - elif ':' in filter_names: - filter_names = map(lambda x: string.split(filter_names,':')[1], filter_names) + elif '.$' in line: + filter_names2=[] + for f in filter_names: ### if the name in the filter is a string within the input data-file + for f1 in values: + if f in f1: + filter_names2.append(f1) ### change to the reference name + break + print len(filter_names2), len(values), len(filter_names);kill + filter_names = filter_names2 + #filter_names = map(lambda x: string.split(x,'.')[0], filter_names) + #values = map(lambda x: string.split(x,'.')[0], values) + sample_index_list = map(lambda x: values.index(x), filter_names) else: + temp_count=1 for x in filter_names: if x not in values: + temp_count+=1 + if temp_count>50: print 'too many to print';kill print x, print 'are missing';kill @@ -222,7 +236,9 @@ def cleanUpLine(line): data = string.replace(line,'\r','') data = string.replace(data,'"','') #https://stackoverflow.com/questions/36598136/remove-all-hex-characters-from-string-in-python - data = data.decode('utf8').encode('ascii', errors='ignore') ### get rid of bad quotes + try: data = data.decode('utf8').encode('ascii', errors='ignore') ### get rid of bad quotes + except: + print data return data def statisticallyFilterFile(input_file,output_file,threshold,minGeneCutoff=499): diff --git a/markerFinder.py b/markerFinder.py index 08a6a42..919d22a 100755 --- a/markerFinder.py +++ b/markerFinder.py @@ -158,7 +158,7 @@ def rhoCalculation(data_list1,tissue_template): kill except Exception: rho = pearson(data_list1,tissue_template) - return rho + return rho, 'Null' def simpleScipyPearson(query_lists,reference_list): """ Get the top correlated values referenced by index of the query_lists (e.g., data matrix) """ @@ -859,6 +859,10 @@ def identifyMarkers(filename,cluster_comps,binarize=False): if tissue not in tissue_list: tissue_list.append(tissue) ### Keep track of the tissue order for ((rho,p),(probeset,symbol)) in tissue_scores[tissue]: + if correlateAllGenes: + try: all_genes_ranked[probeset,symbol].append([(rho,p),tissue]) + except Exception:all_genes_ranked[probeset,symbol] = [[(rho,p),tissue]] + ### Get a matrix of all genes to correlations try: gene_specific_rho_values[symbol].append(rho) except Exception: gene_specific_rho_values[symbol] = [rho] @@ -877,18 +881,13 @@ def identifyMarkers(filename,cluster_comps,binarize=False): except Exception: tissue_specific_IDs[probeset] = [tissue] try: interim_correlations[tissue].append([probeset,symbol,(rho,p)]) except Exception: interim_correlations[tissue] = [[probeset,symbol,(rho,p)]] - if correlateAllGenes: - for tissue in tissue_scores: - for ((rho,p),(probeset,symbol)) in tissue_scores[tissue]: - try: all_genes_ranked[probeset,symbol].append([(rho,p),tissue]) - except Exception:all_genes_ranked[probeset,symbol] = [[(rho,p),tissue]] - """ - - for ID in all_genes_ranked: - ag = all_genes_ranked[ID] - ag.sort() - all_genes_ranked[ID] = ag[-1] ### topcorrelated - """ + + if correlateAllGenes: ### This was commented out - Not sure why - get an error downstream otherwise + for ID in all_genes_ranked: + ag = all_genes_ranked[ID] + ag.sort() + all_genes_ranked[ID] = ag[-1] ### topcorrelated + #""" data = 
export.ExportFile(string.replace(filename[:-4]+'-all-correlations.txt','exp.','MarkerFinder.')) data.write(string.join(tissue_list,'\t')+'\n') @@ -940,11 +939,13 @@ def exportAllGeneCorrelations(filename,allGenesRanked): data.write(title_row+'\n') rho_sorted=[] for (probeset,symbol) in allGenesRanked: - try: (rho,p),tissue = allGenesRanked[(probeset,symbol)] + try: + (rho,p),tissue = allGenesRanked[(probeset,symbol)] except Exception: ### Applies to tiered analysis allGenesRanked[(probeset,symbol)].sort() (rho,p),tissue = allGenesRanked[(probeset,symbol)][-1] + values = string.join([probeset,symbol,str(rho),str(p),tissue],'\t')+'\n' rho_sorted.append([(tissue,1.0/rho),values]) rho_sorted.sort() diff --git a/stats_scripts/ICGS_NMF.py b/stats_scripts/ICGS_NMF.py index 811eff0..b0160fd 100644 --- a/stats_scripts/ICGS_NMF.py +++ b/stats_scripts/ICGS_NMF.py @@ -206,7 +206,7 @@ def hgvfinder(inputfile): counter+=1 #if counter%500==0: print counter, - # break + # break #with open('hgv_0.1.txt', 'w') as f: # for item in hgv: # f.write(str(item)+"\t"+str(hgv[item])) @@ -269,32 +269,32 @@ def UMAPsampling(inputfile): sampmark=[] nn=X.shape[0] nm=X.shape[1] + """ try: import umap - except: - from visualization_scripts.umap_learn import umap + except: from visualization_scripts.umap_learn import umap print "running UMAP" model=umap.UMAP() - + """ from annoy import AnnoyIndex t=AnnoyIndex(nm) for i in range(nn): - try: t.add_item(i,X[i]) - except Exception: print i + try: t.add_item(i,X[i]) + except Exception: print i t.build(100) - # t.save('test.ann') + #t.save('test.ann') #u=AnnoyIndex(nm) diclst={} #u.load('test.ann') #n1=25 print "creating graphs" for i in range(nn): - #ind = tree.query([Xtemp[i]],k=10,return_distance=False,dualtree=True) + #ind = tree.query([Xtemp[i]],k=10,return_distance=False,dualtree=True) ind=t.get_nns_by_item(i,10) diclst[i]=ind print "creating graphs" G=nx.from_dict_of_lists(diclst) - # nx.write_adjlist(G,"test.adjlist") + #nx.write_adjlist(G,"test.adjlist") #G=nx.read_adjlist("test.adjlist") dendrogram= community.generate_dendrogram(G) #for level in range(len(dendrogram) - 1): @@ -318,7 +318,7 @@ def UMAPsampling(inputfile): matri=np.array(comval[key1]) matri=np.array(matri) - #n=matri.shape[0] + #n=matri.shape[0] D=pairwise_distances(matri,metric='euclidean').tolist() D=np.array(D) @@ -476,13 +476,12 @@ def PageRankSampling(inputfile,downsample_cutoff): samptemp.append(header[i]) sampmark=samptemp - if len(sampmark)>3000 and n<5000: + if len(sampmark)>downsample_cutoff: output_file=inputfile[:-4]+'-filtered.txt' sampleIndexSelection.filterFile(inputfile,output_file,sampmark) sampmark=sampling(output_file) return sampmark else: - return sampmark @@ -951,7 +950,7 @@ def generateMarkerheatmap(processedInputExpFile,output_file,NMFSVM_centroid_clus def callICGS(processedInputExpFile,species,rho_cutoff,dynamicCorrelation,platform,gsp): #Run ICGS recursively to dynamically identify the best rho cutoff - graphic_links3,n = RNASeq.singleCellRNASeqWorkflow(species,platform,processedInputExpFile,mlp,exp_threshold=0, rpkm_threshold=0, parameters=gsp) + graphic_links3,n = RNASeq.singleCellRNASeqWorkflow(species,platform,processedInputExpFile,mlp,rpkm_threshold=0, parameters=gsp) if n>5000 and dynamicCorrelation: rho_cutoff=rho_cutoff+0.1 gsp.setRhoCutoff(rho_cutoff) @@ -1316,8 +1315,12 @@ def exportGroups(cluster_file,outdir,platform): def runICGS_NMF(inputExpFile,scaling,platform,species,gsp,enrichmentInput='',dynamicCorrelation=True): """ Export the filtered expression file then run 
downsampling analysis and prepares files for ICGS. After running ICGS, peform enrichment analyses """ + try: downsample_cutoff = gsp.DownSample() + except: downsample_cutoff = 2500 + print 'DownSample threshold =',downsample_cutoff, 'cells' + print 'Filtering the expression dataset (be patient).', - print_out, inputExpFile = RNASeq.singleCellRNASeqWorkflow(species,platform,inputExpFile,mlp,exp_threshold=0,rpkm_threshold=0,parameters=gsp,reportOnly=True) + print_out, inputExpFile = RNASeq.singleCellRNASeqWorkflow(species,platform,inputExpFile,mlp,rpkm_threshold=0,parameters=gsp,reportOnly=True) print 'Running ICGS-NMF' ### Find the parent dir of the output directory (expression file from the GUI will be stored in the output dir [ExpressionInput]) @@ -1334,10 +1337,7 @@ def runICGS_NMF(inputExpFile,scaling,platform,species,gsp,enrichmentInput='',dyn ### Use dispersion (variance by mean) to define initial variable genes inputExpFileVariableGenesDir,n=hgvfinder(inputExpFile) ### returns filtered expression file with 500 variable genes - - try: downsample_cutoff = gsp.DownSample() - except: downsample_cutoff = 2500 - + if n>downsample_cutoff and scaling: if n>25000: ### For extreemly large datasets, UMAP is used as a preliminary downsampling before pagerank inputExpFileScaled=inputExpFile[:-4]+'-UMAP-downsampled.txt' diff --git a/stats_scripts/cellHarmony.py b/stats_scripts/cellHarmony.py new file mode 100644 index 0000000..116c68a --- /dev/null +++ b/stats_scripts/cellHarmony.py @@ -0,0 +1,79 @@ +import sys,string,os +sys.path.insert(1, os.path.join(sys.path[0], '..')) ### import parent dir dependencies +from scipy import sparse, io +import numpy +import LineageProfilerIterate +import cluster_corr + +""" cellHarmony with Louvain Clustering Funcitons """ + +def manage_louvain_alignment(species,platform,query_exp_file,exp_output, + customMarkers=False,useMulti=False,fl=None,customLabels=None): + + if 'ICGS' in customMarkers or 'MarkerGene' in customMarkers: + """ When performing cellHarmony, build an ICGS expression reference with log2 TPM values rather than fold """ + print 'Converting ICGS folds to ICGS expression values as a reference first...' + try: customMarkers = LineageProfilerIterate.convertICGSClustersToExpression(customMarkers,query_exp_file,returnCentroids=False,species=species) + except: + print "Using the supplied reference file only (not importing raw expression)...Proceeding without differential expression analsyes..." 
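# A minimal numpy sketch of the CPTT (counts per ten-thousand, log2) scaling that
# combineCSVFiles in import_scripts/combineCSV.py (added earlier in this patch) applies
# per barcode; the function name and the example matrix are illustrative assumptions.
import numpy as np

def cptt_normalize(counts):
    """counts: genes x cells array of raw counts. Returns
    log2((10000 * count / barcode_total) + 1) for every value."""
    counts = np.asarray(counts, dtype=float)
    barcode_totals = counts.sum(axis=0)            # total counts per cell
    barcode_totals[barcode_totals == 0] = 1.0      # avoid division by zero
    return np.log2((10000.0 * counts / barcode_totals) + 1.0)

# e.g. two genes x three cells:
# cptt_normalize([[10, 0, 5], [90, 100, 5]])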
+ pass + + gene_list = None + if species != None: + gene_list = cluster_corr.read_gene_list(customMarkers) + + export_directory = os.path.abspath(os.path.join(query_exp_file, os.pardir)) + dataset_name = string.replace(string.split(query_exp_file,'/')[-1][:-4],'exp.','') + try: os.mkdir(export_directory+'/CellClassification/') + except: pass + output_classification_file = export_directory+'/CellClassification/'+dataset_name+'-CellClassification.txt' + reference = customMarkers + query = query_exp_file + louvain_results = cluster_corr.find_nearest_cells(reference, + query_exp_file, + gene_list=gene_list, + num_neighbors=10, + num_trees=100, + louvain_level=0, + min_cluster_correlation=-1, + genome=species) + cluster_corr.write_results_to_file(louvain_results, output_classification_file, labels=customLabels) + + LineageProfilerIterate.harmonizeClassifiedSamples(species, reference, query_exp_file, output_classification_file,fl=fl) + + return True + +if __name__ == '__main__': + + from argparse import ArgumentParser + parser = ArgumentParser(description="find the cells in reference_h5 that are most similar to the cells in query_h5") + parser.add_argument("reference_h5", help="a CellRanger h5 file") + parser.add_argument("query_h5", help="a CellRanger h5 file") + parser.add_argument("output", help="the result file to write") + parser.add_argument("-g", "--genes", default=None, help="an ICGS file with the genes to use") + parser.add_argument("-s", "--genome", default=None, help="genome aligned to") + parser.add_argument("-k", "--num_neighbors", type=int, default=10, + help="number of nearest neighbors to use in clustering, default: %(default)s") + parser.add_argument("-t", "--num_trees", type=int, default=100, + help="number of trees to use in random forest for approximating nearest neighbors, default: %(default)s") + parser.add_argument("-l", "--louvain", type=int, default=0, + help="what level to cut the clustering dendrogram. 0 is the most granular, -1 the least. Default: %(default)s") + parser.add_argument("-m", "--min_correlation", type=float, default=-1, + help="the lowest correlation permissible between clusters. Any clusters in query that don't correlate to ref at least this well will be skipped. Default: %(default)s") + parser.add_argument("-d", "--diff_expression", type=float, default=True, + help="perform differential expression analyses. 
Default: %(default)s") + parser.add_argument("-b", "--labels", type=str, default=None, help = "a tab-delimited text file with two columns (reference cell barcode and cluster name)") + + args = parser.parse_args() + + genome = None + if args.genes != None: + genome = args.genome + + if args.labels != None: + labels = cluster_corr.read_labels_dictionary(args.labels) + + platform = 'RNASeq' + manage_louvain_alignment(genome,platform,args.query_h5,None, + customMarkers=args.reference_h5,useMulti=False,fl=None,customLabels=labels) + \ No newline at end of file diff --git a/stats_scripts/cell_collection.py b/stats_scripts/cell_collection.py new file mode 100644 index 0000000..35c9c0c --- /dev/null +++ b/stats_scripts/cell_collection.py @@ -0,0 +1,284 @@ +from __future__ import print_function + +import h5py +import scipy.io +from scipy import sparse, stats +import numpy as np +import sys, string, os, csv, math +sys.path.insert(1, os.path.join(sys.path[0], '..')) ### import parent dir dependencies + +def index_items(universe, itemset): + """ + Returns a list of indices to the items in universe that match items in itemset + """ + return [ idx for idx, item in enumerate(universe) if item in itemset ] + +class CellCollection: + """ + Encapsulates a cohort of cells, ie from a CellRanger run + + Expression values are stored in a sparse matrix, and barcodes/gene identifiers are + maintained in parallel arrays. Construct by calling CellCollection.from_cellranger_h5() + """ + + @staticmethod + def from_cellranger_h5(h5_filename, genome=None, returnGenes=False): + """ + Creates a CellCollection from the contents of an H5 file created by CellRanger. + + If genome is None, the only genome in the H5 will be loaded. If genome is None and multiple genomes + are in the H5, an Exception is raised. + """ + coll = CellCollection() + f = h5py.File(h5_filename, 'r') + if genome == None: + possible_genomes = f.keys() + if len(possible_genomes) != 1: + raise Exception("{} contains multiple genomes ({}). Explicitly select one".format(h5_filename, ", ".join(possible_genomes))) + genome = possible_genomes[0] + #print("Auto-selecting genome {}".format(genome), file=sys.stderr) + + coll._gene_names = f[genome]['gene_names'] + coll._gene_ids = f[genome]['genes'] + coll._barcodes = f[genome]['barcodes'] + coll._barcodes = map(lambda x: string.replace(x,'-1',''), coll._barcodes) + + if returnGenes: + """ Do not import the matrix at this point """ + return list(coll._gene_names) + + coll._matrix = sparse.csc_matrix((f[genome]['data'], f[genome]['indices'], f[genome]['indptr'])) + + print('sparse matrix data imported from h5 file...') + return coll + + @staticmethod + def from_cellranger_mtx(matrix_directory, genome=None, returnGenes=False): + """ + Creates a CellCollection from the contents of an mtx directory created by CellRanger. 
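# A standalone sketch of the h5py/scipy pattern used by CellCollection.from_cellranger_h5
# above; the helper name and file path are assumptions, and no shape is passed to
# csc_matrix (it is inferred from indices/indptr, as in the method above).
import h5py
from scipy import sparse

def load_cellranger_h5(h5_filename, genome=None):
    """Return (barcodes, gene_names, CSC matrix) from a CellRanger-style h5 file."""
    f = h5py.File(h5_filename, 'r')
    if genome is None:
        keys = list(f.keys())
        if len(keys) != 1:
            raise Exception('Multiple genomes (%s); select one explicitly' % ', '.join(keys))
        genome = keys[0]
    grp = f[genome]
    matrix = sparse.csc_matrix((grp['data'], grp['indices'], grp['indptr']))
    return list(grp['barcodes']), list(grp['gene_names']), matrix

# hypothetical usage:
# barcodes, genes, mat = load_cellranger_h5('filtered_gene_bc_matrices_h5.h5')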
+ """ + + coll = CellCollection() + + genes_path = os.path.join(matrix_directory, "genes.tsv") + if os.path.isfile(genes_path)==False: + genes_path = os.path.join(matrix_directory, "features.tsv") + gene_ids = [row[0] for row in csv.reader(open(genes_path), delimiter="\t")] + + try: + coll._gene_names = np.array([row[1] for row in csv.reader(open(genes_path), delimiter="\t")]) + except: + coll._gene_names = np.array([row[0] for row in csv.reader(open(genes_path), delimiter="\t")]) + if returnGenes: + """ Do not import the matrix at this point """ + return list(coll._gene_names) + + coll._gene_ids = coll._gene_names + barcodes_path = os.path.join(matrix_directory, "barcodes.tsv") + barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")] + coll._barcodes = np.array(map(lambda x: string.replace(x,'-1',''), barcodes)) + coll._matrix = scipy.io.mmread(os.path.join(matrix_directory, "matrix.mtx")) + + print('sparse matrix data imported from mtx file...') + return coll + + @staticmethod + def from_tsvfile(tsv_file, genome=None, returnGenes=False, gene_list=None): + """ + Creates a CellCollection from the contents of a tab-separated text file. + """ + + coll = CellCollection() + + UseDense=False + header=True + skip=False + for line in open(tsv_file,'rU').xreadlines(): + if header: + delimiter = ',' # CSV file + start = 1 + if 'row_clusters' in line: + start=2 # An extra column and row are present from the ICGS file + skip=True + if '\t' in line: + delimiter = '\t' # TSV file + + coll._barcodes=string.split(line.rstrip(),delimiter)[start:] + coll._gene_names=[] + data_array=[] + header=False + elif skip: + skip=False # Igore the second row in the file that has cluster info + else: + values = line.rstrip().split(delimiter) + gene = values[0] + if gene_list!=None: + if gene not in gene_list: + continue + if ' ' in gene: + gene = string.split(gene,' ')[0] + if ':' in gene: + coll._gene_names.append((gene.rstrip().split(':'))[1]) + else: + coll._gene_names.append(gene) + + """ If the data (always log2) is a float, increment by 0.5 to round up """ + if returnGenes==False: + if UseDense: + data_array.append(map(float,values[start:])) + else: + #data_array.append(map(lambda x: round(math.pow(2,float(x))),values[start:])) + data_array.append(map(float,values[start:])) + + if returnGenes: + """ Do not import the matrix at this point """ + return list(coll._gene_names) + + if UseDense: + coll._matrix = np.array(data_array) + else: + """ Convert to a sparse matrix """ + coll._matrix = sparse.csc_matrix(np.array(data_array)) + + coll._barcodes = np.array(coll._barcodes) + coll._gene_names = np.array(coll._gene_names) + coll._gene_ids = coll._gene_names + print('sparse matrix data imported from TSV file...') + return coll + + def __init__(self): + self._matrix = sparse.csc_matrix((0,0), dtype=np.int8) + self._barcodes = () + self._gene_names = () + self._gene_ids = () + + def __getattr__(self, name): + """ + Methods/attributes not explicitly defined in the CellCollection are passed down + to the matrix + """ + return getattr(self._matrix, name) + + def num_genes(self): + return len(self._gene_ids) + + def num_cells(self): + return len(self._barcodes) + + def get_barcode(self, cell_index): + return self._barcodes[cell_index] + + def get_cell_expression_vector(self, cell_index): + """ + Returns a (standard, non-sparse) sequence of expression values for a given cell + """ + try: + return self._matrix.getcol(cell_index).todense() + except: + return self._matrix[:,cell_index] # ith column for 
existing dense matrix + + def centroid(self): + """ + Returns the centroid of this collection as a (standard, non-sparse) sequence. + + The centroid is defined as the mean expression of each gene + """ + return self._matrix.mean(axis=1) + + def partition(self, partition_dict): + """ + Returns a dictionary of CellCollections, each a distinct subset (by cell) of self. + partition_dict is a dictionary of cell index => set id, as generated by + the python-louvain methods + """ + partitions = {} + for k, v in partition_dict.items(): + if v not in partitions: partitions[v] = [] + partitions[v].append(k) + result = {} + for part_id in partitions.keys(): + result[part_id] = self.subset_by_cell_index(partitions[part_id]) + return result + + def find_best_correlated(self, query): + """ + Identifies the cell in this collection that has the highest Pearson's correlation + with query (a sequence of expression values in the same order as in this collection) + + Returns the pair of (barcode, r^2 value) for the best match in ref + """ + best_cor = -2 + best_bc = "" + for idx in range(self.num_cells()): + r = self.get_cell_expression_vector(idx) + cor = stats.pearsonr(query, r)[0][0] # pearsonr returns the pair (r^2, p-val), and for some reason the r^2 is a list + if cor > best_cor: + best_cor = cor + best_bc = self.get_barcode(idx) + return best_bc, best_cor + + def filter_by_cell_index(self, cell_index): + self._matrix = self._matrix[:, cell_index] + self._barcodes = self._barcodes[cell_index] + + def subset_by_cell_index(self, cell_index): + """ + Returns a new CellCollection containing only chosen cells from self + """ + cc = CellCollection() + cc._gene_ids = self._gene_ids + cc._gene_names = self._gene_names + cc._matrix = self._matrix[:, cell_index] + cc._barcodes = self._barcodes[cell_index] + return cc + + def filter_barcodes(self, barcode_list): + """ + Reduces the CellCollection in-place to only contain the barcodes requested + """ + barcode_subset = set(barcode_list) + #print("Selecting {} barcodes".format(len(barcode_subset)), file=sys.stderr) + barcode_index = index_items(self._barcodes, barcode_subset) + self.filter_by_cell_index(barcode_index) + + def subset_barcodes(self, barcode_list): + barcode_subset = set(barcode_list) + barcode_index = index_items(self._barcodes, barcode_subset) + return self.subset_by_cell_index(barcode_index) + + def _filter_genes_by_index(self, gene_index): + #print(gene_index);sys.exit() + self._matrix = self._matrix[gene_index, :] + self._gene_ids = self._gene_ids[gene_index] + self._gene_names = self._gene_names[gene_index] + #mat_array_original = self._matrix.toarray() + #print(len(mat_array_original)) + + def filter_genes_by_symbol(self, symbol_list): + """ + Reduces the CellCollection in-place to only contain the genes requested. + + Note that gene symbols could be non-unique, and thus more genes may remain in the + filtered collection than were requested. 
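# find_best_correlated above scans reference cells one at a time with scipy.stats.pearsonr;
# the sketch below is an equivalent vectorized form using numpy.corrcoef. Array and
# argument names are illustrative assumptions.
import numpy as np

def best_correlated_barcode(query_vector, ref_matrix, ref_barcodes):
    """query_vector: one cell's expression values (length = n_genes).
    ref_matrix: genes x cells reference array. Returns (barcode, r) of the
    reference cell with the highest Pearson correlation to the query."""
    query_vector = np.asarray(query_vector, dtype=float).ravel()
    ref_matrix = np.asarray(ref_matrix, dtype=float)
    stacked = np.vstack([query_vector, ref_matrix.T])   # rows = cells
    corrs = np.corrcoef(stacked)[0, 1:]                 # query vs each reference cell
    best = int(np.nanargmax(corrs))
    return ref_barcodes[best], float(corrs[best])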
The order of the genes in the h5 may also + differ and the same genes may not be present in the different sets + """ + gene_subset = set(symbol_list) + #print("Selecting {} genes".format(len(gene_subset)), file=sys.stderr) + #gene_index = index_items(self._gene_names, gene_subset) # will output genes in the full dataset order + gene_index=[] + gene_names = list(self._gene_names) + for gene in gene_subset: + if gene in gene_names: + gene_index.append(gene_names.index(gene)) + self._filter_genes_by_index(gene_index) + + def filter_genes_by_id(self, id_list): + """ + Reduces the CellCollection in-place to only contain the genes requested. + """ + gene_subset = set(id_list) + #print("Selecting {} genes".format(len(gene_subset)), file=sys.stderr) + gene_index = index_items(self._gene_ids, gene_subset) + self._filter_genes_by_index(gene_index) + + diff --git a/stats_scripts/cluster_corr.py b/stats_scripts/cluster_corr.py new file mode 100644 index 0000000..fa32cdb --- /dev/null +++ b/stats_scripts/cluster_corr.py @@ -0,0 +1,255 @@ +from __future__ import print_function + +from cell_collection import CellCollection +from annoy import AnnoyIndex +import community # python-louvain package, imported as community +import networkx +import sys,string,os +sys.path.insert(1, os.path.join(sys.path[0], '..')) # import parent dir dependencies +import numpy as np + +def read_gene_list(filename): + """ + Reads the gene list from a file + """ + gene_list = [] + with open(filename, 'rU') as f: + header_lines = 2 + for line in f: + if header_lines > 0: + header_lines -= 1 + else: + gene = line.split('\t', 1)[0] + if ' ' in gene: + gene = string.split(gene,' ')[0] + if ':' in gene: + gene_list.append((gene.rstrip().split(':'))[1]) + else: + gene_list.append(gene) + return gene_list + +def read_labels_dictionary(filename): + """ + Reads the labels assigned by the user to each barcode (first and last columns) + """ + label_dictionary = {} + with open(filename, 'rU') as f: + header_lines = 2 + for line in f: + barcode = line.split('\t', 1)[0] + label = line.split('\t', 1)[-1] + label_dictionary[barcode]=label.rstrip() + return label_dictionary + +def find_nearest_cells(ref_h5_filename, query_h5_filename, gene_list=None, genome=None, + num_neighbors=10, num_trees=100, louvain_level=-1, min_cluster_correlation=-1): + """ + For every cell in query_h5_filename, identifies the most similar cell in ref_h5_filename. 
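# A small sketch of the shared-gene restriction that find_shared_genes and
# filter_genes_by_symbol perform in this module; here genes are kept in matrix order,
# and the names below are illustrative assumptions.
import numpy as np
from scipy import sparse

def restrict_to_genes(matrix, gene_names, gene_list):
    """matrix: genes x cells (dense array or scipy sparse); gene_names: symbols per row.
    Returns the rows whose symbol appears in gene_list, plus the kept symbols."""
    wanted = set(gene_list)
    keep = [i for i, g in enumerate(gene_names) if g in wanted]
    if sparse.issparse(matrix):
        filtered = matrix.tocsr()[keep, :]
    else:
        filtered = np.asarray(matrix)[keep, :]
    return filtered, [gene_names[i] for i in keep]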
+ + For parameter definitions, see partition_h5_file() and find_closest_cluster(), this is a convenience + function that calls them (among others) + """ + + gene_list = find_shared_genes(ref_h5_filename,genome=genome,gene_list=gene_list) + gene_list = find_shared_genes(query_h5_filename,genome=genome,gene_list=gene_list) + + ref_partition = partition_h5_file(ref_h5_filename, gene_list=gene_list, num_neighbors=num_neighbors, + num_trees=num_trees, louvain_level=louvain_level,genome=genome) + query_partition = partition_h5_file(query_h5_filename, gene_list=gene_list, num_neighbors=num_neighbors, + num_trees=num_trees, louvain_level=louvain_level,genome=genome) + best_match = find_closest_cluster(query_partition, ref_partition, min_correlation=min_cluster_correlation) + result = {} + for query_part_id, ref_part_id in best_match: + ref = ref_partition[ref_part_id] + query = query_partition[query_part_id] + for idx in range(query.num_cells()): + q_barcode = query.get_barcode(idx) + best_bc, best_cor = ref.find_best_correlated(query.get_cell_expression_vector(idx)) + result[q_barcode] = {'barcode': best_bc, + 'correlation': best_cor, + 'query_partition': query_part_id, + 'ref_partition': ref_part_id} + return result + +def write_results_to_file(results, filename, labels=None): + def add_labels(barcode): + if barcode in labels: + return labels[barcode] + if ':' in barcode: + return labels[barcode.split(':')[1]] + else: + return 'NA' + + if labels == None: + with open(filename, 'w') as f: + print("\t".join( ("Query Barcode", "Ref Barcode", "Correlation", "Query Partition", "Ref Partition") ), file=f) + for q in results.keys(): + print("\t".join( (q, + results[q]['barcode'], + str(results[q]['correlation']), + str(results[q]['query_partition']), + str(results[q]['ref_partition'])) ), file=f) + else: + with open(filename, 'w') as f: + print("\t".join( ("Query Barcode", "Ref Barcode", "Correlation", "Query Partition", "Ref Partition", "Label") ), file=f) + for q in results.keys(): + print("\t".join( (q, + results[q]['barcode'], + str(results[q]['correlation']), + str(results[q]['query_partition']), + str(results[q]['ref_partition']), + add_labels(results[q]['barcode'])) ), file=f) + +def nearest_neighbors(collection, num_neighbors=10, n_trees=100): + """ + Finds the num_neighbors nearest neighbors to each cell in the sparse matrix + + Return result is a dictionary of lists, where the key is an index into the cells, + and the value is the neighbors of that cell + """ + nn_idx = AnnoyIndex(collection.num_genes()) + # Add the elements in reverse order because Annoy allocates the memory based on + # the value of the element added - so adding in increasing order will trigger + # lots of allocations + for i in range(collection.num_cells()-1, -1, -1): + nn_idx.add_item(i, collection.get_cell_expression_vector(i)) + nn_idx.build(n_trees) + return { i: nn_idx.get_nns_by_item(i, num_neighbors) for i in range(collection.num_cells()) } + +def identify_clusters(graph, louvain_level=-1): + """ + Identifies clusters in the given NetworkX Graph by Louvain partitioning. + + The parameter louvain_level controls the degree of partitioning. 0 is the most granular + partition, and granularity decreases as louvain_level increases. 
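# A condensed sketch of the Annoy-based approximate k-nearest-neighbor graph built by
# nearest_neighbors above, converted to a NetworkX graph for Louvain clustering; the
# function name and the random input below are assumptions.
import numpy as np
import networkx as nx
from annoy import AnnoyIndex

def knn_graph(cell_matrix, num_neighbors=10, n_trees=100):
    """cell_matrix: cells x genes array. Nodes are cell indices; edges connect each
    cell to its approximate nearest neighbors."""
    n_cells, n_genes = cell_matrix.shape
    index = AnnoyIndex(n_genes)   # same constructor form as above (older annoy defaults to 'angular')
    for i in range(n_cells):
        index.add_item(i, cell_matrix[i])
    index.build(n_trees)
    neighbors = {i: index.get_nns_by_item(i, num_neighbors) for i in range(n_cells)}
    return nx.from_dict_of_lists(neighbors)

# e.g. G = knn_graph(np.random.rand(200, 50))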
Since the number of + levels can't be known a priori, negative values "count down" from the max - ie, -1 + means to use the maximum possible value and thus get the largest clusters + """ + dendrogram = community.generate_dendrogram(graph) + if louvain_level < 0: + louvain_level = max(0, len(dendrogram) + louvain_level) + if louvain_level >= len(dendrogram): + #print("Warning [identify_clusters]: louvain_level set to {}, max allowable is {}. Resetting".format(louvain_level, len(dendrogram)-1), file=sys.stderr) + louvain_level = len(dendrogram) - 1 + #print("Cutting the Louvain dendrogram at level {}".format(louvain_level), file=sys.stderr) + return community.partition_at_level(dendrogram, louvain_level) + +def find_shared_genes(h5_filename,genome=None,gene_list=None): + """ + Selects genes shared by the reference, query and gene_list + """ + + if gene_list !=None: + if 'h5' in h5_filename: + genes = CellCollection.from_cellranger_h5(h5_filename,returnGenes=True) + elif 'txt' in h5_filename: + genes = CellCollection.from_tsvfile(h5_filename,genome,returnGenes=True,gene_list=gene_list) + else: + genes = CellCollection.from_cellranger_mtx(h5_filename,genome,returnGenes=True) + gene_list = list(set(genes) & set(gene_list)) + + return gene_list + +def partition_h5_file(h5_filename, gene_list=None, num_neighbors=10, num_trees=100, + louvain_level=-1,genome=None): + """ + Reads a CellRanger h5 file and partitions it by clustering on the k-nearest neighbor graph + + Keyword arguments: + gene_list - restricts the analysis to the specified list of gene symbols. Default is to not restrict + num_neighbors - the number of nearest neighbors to compute for each cell + num_trees - the number of trees used in the random forest that approximates the nearest neighbor calculation + louvain_level - the level of the Louvain clustering dendrogram to cut at. Level 0 is the lowest (most granular) + level, and higher levels get less granular. The highest level is considered the "best" set + of clusters, but the number of levels is not known a priori. 
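# A minimal sketch of the python-louvain dendrogram cut implemented by identify_clusters
# above; the small random graph in the usage line is purely illustrative.
import community        # python-louvain
import networkx as nx

def clusters_at_level(graph, louvain_level=-1):
    """Cut the Louvain dendrogram at the requested level; negative values count down
    from the top, so -1 gives the coarsest partition. Returns {node: cluster_id}."""
    dendrogram = community.generate_dendrogram(graph)
    if louvain_level < 0:
        louvain_level = max(0, len(dendrogram) + louvain_level)
    louvain_level = min(louvain_level, len(dendrogram) - 1)
    return community.partition_at_level(dendrogram, louvain_level)

# e.g. clusters_at_level(nx.erdos_renyi_graph(100, 0.05), louvain_level=-1)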
Hence, negative values will + count down from the highest level, so -1 will always be the "best" clustering, regardless of + the actual number of levels in the dendrogram + + Return Result: A dictionary, where the keys are partition ids and the values are the CellCollection for that partition + """ + if 'h5' in h5_filename: + collection = CellCollection.from_cellranger_h5(h5_filename) + elif 'txt' in h5_filename: + collection = CellCollection.from_tsvfile(h5_filename,genome) + else: + collection = CellCollection.from_cellranger_mtx(h5_filename,genome) + if gene_list != None: + collection.filter_genes_by_symbol(gene_list) + neighbor_dict = nearest_neighbors(collection, num_neighbors=num_neighbors, n_trees=num_trees) + cluster_definition = identify_clusters(networkx.from_dict_of_lists(neighbor_dict), louvain_level=louvain_level) + return collection.partition(cluster_definition) + +def compute_centroids(collection_dict): + """Returns (centroid matrix, ID list) for the given dictionary of CellCollections""" + centroids = np.concatenate( [ p.centroid() for p in collection_dict.values() ], axis=1 ) + return centroids, collection_dict.keys() + +def find_closest_cluster(query, ref, min_correlation=-1): + """ + For each collection in query, identifies the collection in ref that is most similar + + query and ref are both dictionaries of CellCollections, keyed by a "partition id" + + Returns a list containing the best matches for each collection in query that meet the + min_correlation threshold. Each member of the list is itself a list containing the + id of the query collection and the id of its best match in ref + """ + query_centroids, query_ids = compute_centroids(query) + ref_centroids, ref_ids = compute_centroids(ref) + #print(len(ref_centroids),len(query_centroids)) + all_correlations = np.corrcoef(np.concatenate((ref_centroids, query_centroids), axis=1), rowvar=False) + + # At this point, we have the correlations of everything vs everything. We only care about query vs ref + # Extract the top-right corner of the matrix + nref = len(ref) + corr = np.hsplit(np.vsplit(all_correlations, (nref, ))[0], (nref,))[1] + best_match = zip(range(corr.shape[1]), np.argmax(corr, 0)) + # At this point, best_match is: 1) using indices into the array rather than ids, + # and 2) not restricted by the threshold. 
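# The everything-vs-everything correlation plus corner extraction used by
# find_closest_cluster above, restated as a standalone sketch; the centroid arrays in
# the usage line are random placeholders.
import numpy as np

def match_clusters(query_centroids, ref_centroids, min_correlation=-1):
    """query_centroids / ref_centroids: genes x clusters arrays of cluster means.
    Returns (query_idx, ref_idx) pairs for the best reference match of each query
    cluster that meets the correlation threshold."""
    nref = ref_centroids.shape[1]
    both = np.concatenate((ref_centroids, query_centroids), axis=1)
    all_corr = np.corrcoef(both, rowvar=False)     # columns are clusters
    corr = all_corr[:nref, nref:]                  # rows: reference, columns: query
    best = np.argmax(corr, axis=0)                 # best reference for each query
    return [(q, int(r)) for q, r in enumerate(best) if corr[r, q] >= min_correlation]

# e.g. match_clusters(np.random.rand(500, 4), np.random.rand(500, 6))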
Fix before returning + return ( (query_ids[q], ref_ids[r]) for q, r in best_match if corr[r,q] >= min_correlation ) + +if __name__ == "__main__": + # genes = read_gene_list('data/FinalMarkerHeatmap_all.txt') + # results = find_nearest_cells('data/reference-filtered_gene_bc_matrices_h5.h5', + # 'data/query-filtered_gene_bc_matrices_h5.h5', + # gene_list=genes, louvain_level=-1) + # write_results_to_file(results, 'temp.txt') + + from argparse import ArgumentParser + parser = ArgumentParser(description="find the cells in reference_h5 that are most similar to the cells in query_h5") + parser.add_argument("reference_h5", help="a CellRanger h5 file") + parser.add_argument("query_h5", help="a CellRanger h5 file") + parser.add_argument("output", help="the result file to write") + parser.add_argument("-g", "--genes", default=None, help="an ICGS file with the genes to use") + parser.add_argument("-s", "--genome", default=None, help="genome aligned to") + parser.add_argument("-k", "--num_neighbors", type=int, default=10, + help="number of nearest neighbors to use in clustering, default: %(default)s") + parser.add_argument("-t", "--num_trees", type=int, default=100, + help="number of trees to use in random forest for approximating nearest neighbors, default: %(default)s") + parser.add_argument("-l", "--louvain", type=int, default=0, + help="what level to cut the clustering dendrogram. 0 is the most granular, -1 the least. Default: %(default)s") + parser.add_argument("-m", "--min_correlation", type=float, default=-1, + help="the lowest correlation permissible between clusters. Any clusters in query that don't correlate to ref at least this well will be skipped. Default: %(default)s") + parser.add_argument("-b", "--labels", type=str, default=None, help = "a tab-delimited text file with two columns (reference cell barcode and cluster name)") + + args = parser.parse_args() + + gene_list = None + genome = None + labels = None + if args.genes != None: + gene_list = read_gene_list(args.genes) + if args.labels != None: + labels = read_labels_dictionary(args.labels) + if args.genome != None: + genome = args.genome + + results = find_nearest_cells(args.reference_h5, + args.query_h5, + gene_list=gene_list, + num_neighbors=args.num_neighbors, + num_trees=args.num_trees, + louvain_level=args.louvain, + min_cluster_correlation=args.min_correlation, + genome=genome) + write_results_to_file(results, args.output,labels=labels) \ No newline at end of file diff --git a/stats_scripts/metaDataAnalysis.py b/stats_scripts/metaDataAnalysis.py index 93212a9..3d41a10 100755 --- a/stats_scripts/metaDataAnalysis.py +++ b/stats_scripts/metaDataAnalysis.py @@ -285,7 +285,8 @@ def RelativeInclusion(self,dPSI): else: return junction_type -def performDifferentialExpressionAnalysis(species,platform,input_file,groups_db,comps_db,CovariateQuery,splicingEventTypes={}): +def performDifferentialExpressionAnalysis(species,platform,input_file,groups_db,comps_db, + CovariateQuery,splicingEventTypes={},suppressPrintOuts=False): ### filter the expression file for the samples of interest and immediately calculate comparison statistics firstLine = True @@ -422,7 +423,8 @@ def performDifferentialExpressionAnalysis(species,platform,input_file,groups_db, header_samples = (tuple(g1_headers),tuple(g2_headers)) if header_samples not in headers_compared: headers_compared[header_samples]=[] - print len(g1_headers),len(g2_headers),groups + if suppressPrintOuts==False: + print len(g1_headers),len(g2_headers),groups try: data_list1 = 
group_expression_values[group1] except KeyError: @@ -445,6 +447,7 @@ def performDifferentialExpressionAnalysis(species,platform,input_file,groups_db, pass else: continue ### Don't analyze + #print len(g1_headers), len(g2_headers);sys.exit() if (len(data_list1)>1 and len(data_list2)>1) or (len(g1_headers)==1 or len(g2_headers)==1): ### For splicing data ### Just a One-Way ANOVA at first - moderation happens later !!!! try: @@ -1404,7 +1407,10 @@ def exportGeneSetsFromCombined(filename): aro.close() def remoteAnalysis(species,expression_file,groups_file,platform='PSI',log_fold_cutoff=0.1, - use_adjusted_pval=True,pvalThreshold=0.05,use_custom_output_dir=''): + use_adjusted_pval=True,pvalThreshold=0.05,use_custom_output_dir='', + suppressPrintOuts=False): + + print "Performing a differential expression analysis (be patient)..." global pval_threshold global PercentExp global restricted_gene_denominator @@ -1470,7 +1476,9 @@ def remoteAnalysis(species,expression_file,groups_file,platform='PSI',log_fold_c for i in all_groups_db: #print i - for k in all_groups_db[i]: print ' ',k,'\t',len(all_groups_db[i][k]) + for k in all_groups_db[i]: + if suppressPrintOuts == False: + print ' ',k,'\t',len(all_groups_db[i][k]) #print all_comps_db if platform == 'PSI': @@ -1489,7 +1497,7 @@ def remoteAnalysis(species,expression_file,groups_file,platform='PSI',log_fold_c comps_db = all_comps_db[specificCovariate] groups_db = all_groups_db[specificCovariate] rootdir,splicingEventTypes = performDifferentialExpressionAnalysis(species,platform,expression_file, - groups_db,comps_db,CovariateQuery,splicingEventTypes) + groups_db,comps_db,CovariateQuery,splicingEventTypes,suppressPrintOuts=suppressPrintOuts) if platform == 'PSI': graphics = outputSplicingSummaries(rootdir+'/'+CovariateQuery,splicingEventTypes) diff --git a/stats_scripts/removeDoublets.py b/stats_scripts/removeDoublets.py index 5a1d48b..72faddf 100755 --- a/stats_scripts/removeDoublets.py +++ b/stats_scripts/removeDoublets.py @@ -16,7 +16,7 @@ #OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE #SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-import sys,string,os,copy, path +import sys,string,os,copy, numpy, math sys.path.insert(1, os.path.join(sys.path[0], '..')) ### import parent dir dependencies command_args = string.join(sys.argv,' ') @@ -27,9 +27,11 @@ import traceback +from visualization_scripts import clustering + def removeMarkerFinderDoublets(heatmap_file,diff=1): - matrix, column_header, row_header, dataset_name, group_db, priorColumnClusters, priorRowClusters = remoteImportData(heatmap_file) - + matrix, column_header, row_header, dataset_name, group_db, priorColumnClusters, priorRowClusters = clustering.remoteImportData(heatmap_file) + priorRowClusters.reverse() if len(priorColumnClusters)==0: for c in column_header: @@ -83,9 +85,12 @@ def removeMarkerFinderDoublets(heatmap_file,diff=1): cluster_cell_means = numpy.median(cell_max_score_db[cluster],axis=0) cell_max_score_db[cluster] = cluster_cell_means ### This is the cell-state mean score for all cells in that cluster and the alternative max mean score (difference gives you the threshold for detecting double) i=0 - print len(cell_max_scores) + #print len(cell_max_scores) keep=['row_clusters-flat'] keep_alt=['row_clusters-flat'] + remove = ['row_clusters-flat'] + remove_alt = ['row_clusters-flat'] + min_val = 1000 for (cell_score,alt_score,alt_sum) in cell_max_scores: cluster = priorColumnClusters[i] cell = column_header[i] @@ -95,39 +100,45 @@ def removeMarkerFinderDoublets(heatmap_file,diff=1): ref_alt = math.pow(2,(ref_alt)) cell_diff = math.pow(2,(cell_score-alt_score)) cell_score = math.pow(2,cell_score) + if cell_diffref_diff and cell_diff>diff: #cell_score cutoff removes some, but cell_diff is more crucial #if alt_sum0: ### cmap_c is too few colors - cmap_c = pylab.cm.gist_rainbow + cmap_c = pylab.cm.nipy_spectral dc = numpy.array(ind2, dtype=int) dc.shape = (1,len(ind2)) @@ -966,7 +983,7 @@ def onpick1(event): cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF', '#CCCCE0','#000066','#FFFF00', '#FF1493']) #cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF','#FFFF00', '#FF1493']) if use_default_colors: - cmap_c = pylab.cm.gist_rainbow + cmap_c = pylab.cm.nipy_spectral else: if len(unique.unique(ind2_clust))==2: ### cmap_c is too few colors #cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) @@ -984,7 +1001,7 @@ def onpick1(event): elif len(unique.unique(ind2_clust))==7: ### cmap_c is too few colors cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C']) elif len(unique.unique(ind2_clust))>0: ### cmap_c is too few colors - cmap_c = pylab.cm.gist_rainbow + cmap_c = pylab.cm.nipy_spectral dc = numpy.array(ind2_clust, dtype=int) dc.shape = (1,len(ind2_clust)) im_cd = axcd.matshow(dc, aspect='auto', origin='lower', cmap=cmap_c) @@ -1024,7 +1041,7 @@ def onpick1(event): #Eryth Gfi1 Gran HSCP-1 HSCP-2 IG2 MDP Meg Mono Multi-Lin Myelo #cmap_d = matplotlib.colors.ListedColormap(['#DC2342', 'k', '#0B9B48', '#FDDF5E', '#E0B724', 'w', '#5D82C1', '#F79020', '#4CB1E4', '#983894', '#71C065']) elif len(unique.unique(ind2))>0: ### cmap_c is too few colors - cmap_d = pylab.cm.gist_rainbow + cmap_d = pylab.cm.nipy_spectral dc = numpy.array(group_colors, dtype=int) dc.shape = (1,len(group_colors)) im_c = axd.matshow(dc, aspect='auto', origin='lower', cmap=cmap_d) @@ -1048,7 +1065,7 @@ def onpick1(event): #print ind1, len(ind1) cmap_r = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF', '#FFFF00', '#FF1493']) if len(unique.unique(ind1))>4: ### cmap_r is too few colors - 
cmap_r = pylab.cm.gist_rainbow_r + cmap_r = pylab.cm.nipy_spectral_r if len(unique.unique(ind1))==2: cmap_r = matplotlib.colors.ListedColormap(['w', 'k']) im_r = axr.matshow(dr, aspect='auto', origin='lower', cmap=cmap_r) @@ -1686,7 +1703,8 @@ def remoteImportData(filename,geneFilter=None,reverseOrder=True): except: return matrix, column_header, row_header, dataset_name, group_db, [], [] -def importData(filename,Normalize=False,reverseOrder=True,geneFilter=None,zscore=False): +def importData(filename,Normalize=False,reverseOrder=True,geneFilter=None, + zscore=False,forceClusters=False): global priorColumnClusters global priorRowClusters @@ -1718,11 +1736,11 @@ def importData(filename,Normalize=False,reverseOrder=True,geneFilter=None,zscore new_headers=[] temp_groups={} original_headers=t[1:] - if ('exp.' in filename or 'filteredExp.' in filename or 'MarkerGene' in filename):# and ':' not in data: + if ('exp.' in filename or 'filteredExp.' in filename or 'MarkerGene' in filename) and forceClusters==False:# and ':' not in data: if overwriteGroupNotations: ### Use groups file annotations over any header sample separation with a ":" for i in t: - if ':' in i: + if ':' in i: ### Don't annotate groups according to the clusters group,i = string.split(i,':') new_headers.append(i) temp_groups[i] = group @@ -1768,8 +1786,10 @@ def importData(filename,Normalize=False,reverseOrder=True,geneFilter=None,zscore prior.append(c1) index+=1 #prior=[] + if len(temp_groups)==0: ### Hence, no secondary group label combined with the sample name - group_db, column_header = assignGroupColors(original_headers) + if '-ReOrdered.txt' not in filename: ### Applies to cellHarmony UMAP and heatmap visualization + group_db, column_header = assignGroupColors(original_headers) #priorColumnClusters = dict(zip(column_header, prior)) priorColumnClusters = prior except Exception: @@ -1959,7 +1979,7 @@ def assignGroupColors(t): if len(group_number_db)>3: color_list = [] - cm = pylab.cm.get_cmap('gist_rainbow') #gist_ncar # binary + cm = pylab.cm.get_cmap('nipy_spectral') #gist_ncar # binary for i in range(len(group_number_db)): color_list.append(cm(1.*i/len(group_number_db))) # color will now be an RGBA tuple #color_list=[] @@ -2082,7 +2102,8 @@ def runUMAP(matrix, column_header,dataset_name,group_db,display=False,showLabels shutil.move(old_file,new_file) def tSNE(matrix, column_header,dataset_name,group_db,display=True,showLabels=False, - row_header=None,colorByGene=None,species=None,reimportModelScores=True,method="tSNE"): + row_header=None,colorByGene=None,species=None,reimportModelScores=True, + method="tSNE",maskGroups=None): try: prior_clusters = priorColumnClusters except Exception: prior_clusters=[] @@ -2108,7 +2129,7 @@ def tSNE(matrix, column_header,dataset_name,group_db,display=True,showLabels=Fal print root_dir+dataset_name+'-'+method+'_scores.txt' try: scores = importtSNEScores(root_dir+dataset_name+'-'+method+'_scores.txt'); print '...import finished' except Exception: - reimportModelScores=False; print '...import failed' + reimportModelScores=False; print '...no existing score file found' if reimportModelScores==False: X=matrix.T @@ -2145,6 +2166,10 @@ def tSNE(matrix, column_header,dataset_name,group_db,display=True,showLabels=Fal writetSNEScores(scores,root_dir+dataset_name+'-'+method+'_scores.txt') #pylab.scatter(scores[:,0], scores[:,1], 20, labels); + if maskGroups != None: + group_name,restricted_samples = maskGroups + dataset_name += '-'+group_name ### indicate the restricted group + ### Exclude samples 
with high TSNE deviations scoresT = zip(*scores) exclude={} @@ -2243,7 +2268,6 @@ def get_cmap(n, name='hsv'): RGB color; the keyword argument name must be a standard mpl colormap name.''' return pylab.cm.get_cmap(name, n) - if genePresent: dataset_name+='-'+colorByGene group_db={} @@ -2333,6 +2357,12 @@ def get_cmap(n, name='hsv'): group_names[group_name] = color except Exception: color = 'r'; label=None + if maskGroups != None: + base_name = sample_name + if ':' in sample_name: + base_name = string.split(base_name,':')[1] + if base_name not in restricted_samples: + exclude[i]=None ### Don't visualize this sample if i not in exclude: ax.plot(scores[i][0],scores[i][1],color=color,marker='o',markersize=marker_size,label=label,markeredgewidth=0,picker=True) #except Exception: print i, len(scores[pcB]);kill @@ -2414,7 +2444,7 @@ def excludeHighlyCorrelatedHits(x,row_header): def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, group_db, display=False, showLabels=True, algorithm='SVD', geneSetName=None, - species=None, pcA=1,pcB=2, colorByGene=None): + species=None, pcA=1,pcB=2, colorByGene=None, reimportModelScores=True): print "Performing Principal Component Analysis..." from numpy import mean,cov,double,cumsum,dot,linalg,array,rank @@ -2435,7 +2465,8 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, pcA-=1 pcB-=1 - + label1='' + label2='' """ Based in part on code from: http://glowingpython.blogspot.com/2011/07/principal-component-analysis-with-numpy.html @@ -2460,89 +2491,103 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, if algorithm == 'SVD': use_svd = True else: use_svd = False - #Mdif = matrix-matrix.mean(axis=0)# subtract the mean (along columns) - #M = (matrix-mean(matrix.T,axis=1)).T # subtract the mean (along columns) - Mdif = matrix/matrix.std() - Mdif = Mdif.T - u, s, vt = svd(Mdif, 0) - fracs = s**2/np.sum(s**2) - entropy = -sum(fracs*np.log(fracs))/np.log(np.min(vt.shape)) - - label1 = 'PC%i (%2.1f%%)' %(pcA+1, fracs[0]*100) - label2 = 'PC%i (%2.1f%%)' %(pcB+1, fracs[1]*100) - - #http://docs.scipy.org/doc/scipy/reference/sparse.html - #scipy.sparse.linalg.svds - sparse svd - #idx = numpy.argsort(vt[0,:]) - #print idx;sys.exit() # Use this as your cell order or use a density analysis to get groups - #### FROM LARSSON ######## - #100 most correlated Genes with PC1 - #print vt - PCsToInclude = 4 - correlated_db={} - allGenes={} - new_matrix = [] - new_headers = [] - added_indexes=[] - x = 0 - #100 most correlated Genes with PC1 - print 'exporting PCA loading genes to:',root_dir+'/PCA/correlated.txt' - exportData = export.ExportFile(root_dir+'/PCA/correlated.txt') - - matrix = zip(*matrix) ### transpose this back to normal - try: - while x0: - exportCustomGeneSet(geneSetName,species,allGenes) - print 'Exported geneset to "StoredGeneSets"' - except Exception: - pass - - ########################### + label1 = 'PC%i (%2.1f%%)' %(pcA+1, fracs[0]*100) + label2 = 'PC%i (%2.1f%%)' %(pcB+1, fracs[1]*100) - #if len(row_header)>20000: - #print '....Using eigenvectors of the real symmetric square matrix for efficiency...' 
- #[latent,coeff] = scipy.sparse.linalg.eigsh(cov(M)) - #scores=mlab.PCA(scores) + #http://docs.scipy.org/doc/scipy/reference/sparse.html + #scipy.sparse.linalg.svds - sparse svd + #idx = numpy.argsort(vt[0,:]) + #print idx;sys.exit() # Use this as your cell order or use a density analysis to get groups + + #### FROM LARSSON ######## + #100 most correlated Genes with PC1 + #print vt + PCsToInclude = 4 + correlated_db={} + allGenes={} + new_matrix = [] + new_headers = [] + added_indexes=[] + x = 0 + #100 most correlated Genes with PC1 + print 'exporting PCA loading genes to:',root_dir+'/PCA/correlated.txt' + exportData = export.ExportFile(root_dir+'/PCA/correlated.txt') + + matrix = zip(*matrix) ### transpose this back to normal + try: + while x0: + exportCustomGeneSet(geneSetName,species,allGenes) + print 'Exported geneset to "StoredGeneSets"' + except Exception: + pass - if use_svd == False: - [latent,coeff] = linalg.eig(cov(M)) - scores = dot(coeff.T,M) # projection of the data in the new space - else: - ### transform u into the same structure as the original scores from linalg.eig coeff - scores = vt + ########################### + + #if len(row_header)>20000: + #print '....Using eigenvectors of the real symmetric square matrix for efficiency...' + #[latent,coeff] = scipy.sparse.linalg.eigsh(cov(M)) + #scores=mlab.PCA(scores) + + if use_svd == False: + [latent,coeff] = linalg.eig(cov(M)) + scores = dot(coeff.T,M) # projection of the data in the new space + else: + ### transform u into the same structure as the original scores from linalg.eig coeff + scores = vt + + writetSNEScores(scores,root_dir+dataset_name+'-PCA_scores.txt') fig = pylab.figure() ax = fig.add_subplot(111) @@ -2570,6 +2615,7 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, #samples = list(column_header) ### Color By Gene if colorByGene != None: + print 'Coloring based on feature expression.' 
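# A compact sketch of the variance-explained calculation in the reworked SVD block of
# PrincipalComponentAnalysis above; the helper name and the random matrix are assumptions.
import numpy as np

def pc_variance_explained(matrix):
    """matrix: genes x samples. Scale by the global standard deviation, run an economy
    SVD and return the fraction of variance captured by each component (used above to
    label the PC axes)."""
    matrix = np.asarray(matrix, dtype=float)
    scaled = (matrix / matrix.std()).T
    u, s, vt = np.linalg.svd(scaled, full_matrices=False)
    return s ** 2 / np.sum(s ** 2)

# e.g. 'PC1 (%2.1f%%)' % (pc_variance_explained(np.random.rand(50, 20))[0] * 100)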
gene_translation_db={} matrix = numpy.array(matrix) min_val = matrix.min() ### min val @@ -2605,7 +2651,7 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, else: if numberGenesPresent==2: cm = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) - #cm = matplotlib.colors.ListedColormap(['w', 'k']) + #cm = matplotlib.colors.ListedColormap(['w', 'k']) ### If you want to hide one of the groups elif numberGenesPresent==3: cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) elif numberGenesPresent==4: @@ -2617,7 +2663,13 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, elif numberGenesPresent==7: cm = matplotlib.colors.ListedColormap(['#88BF47', '#63C6BB', '#29C3EC', '#3D3181', '#7B4976','#FEBC18', '#EE2C3C']) else: - cmap_c = pylab.cm.gist_rainbow + cm = pylab.cm.get_cmap('gist_rainbow') + + def get_cmap(n, name='hsv'): + '''Returns a function that maps each index in 0, 1, ..., n-1 to a distinct + RGB color; the keyword argument name must be a standard mpl colormap name.''' + return pylab.cm.get_cmap(name, n) + if genePresent: dataset_name+='-'+colorByGene group_db={} @@ -2641,6 +2693,7 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, ranges.append(r) iz+=bin_size color_db = {} + colors = get_cmap(len(genes)) for i in range(len(ranges)): if i==0: color = '#C0C0C0' @@ -2651,7 +2704,10 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, #color = cm(1.*(i+1)/len(ranges)) else: if i>2: - color = cm(k) + if len(genes)<8: + color = cm(k) + else: + color = colors(k) else: color = '#C0C0C0' color_db[ranges[i]] = color @@ -4265,7 +4321,7 @@ def timestamp(): def runPCAonly(filename,graphics,transpose,showLabels=True,plotType='3D',display=True, algorithm='SVD',geneSetName=None, species=None, zscore=True, colorByGene=None, - reimportModelScores=True, separateGenePlots=False): + reimportModelScores=True, separateGenePlots=False, forceClusters=False, maskGroups=None): global root_dir global graphic_link @@ -4302,7 +4358,7 @@ def runPCAonly(filename,graphics,transpose,showLabels=True,plotType='3D',display #print traceback.format_exc();sys.exit() geneFilter = None ### It won't import the matrix, basically - matrix, column_header, row_header, dataset_name, group_db = importData(filename,zscore=zscore,geneFilter=geneFilter) + matrix, column_header, row_header, dataset_name, group_db = importData(filename,zscore=zscore,geneFilter=geneFilter,forceClusters=forceClusters) if transpose == False: ### We normally transpose the data, so if True, we don't transpose (I know, it's confusing) matrix = map(numpy.array, zip(*matrix)) ### coverts these to tuples column_header, row_header = row_header, column_header @@ -4323,7 +4379,20 @@ def runPCAonly(filename,graphics,transpose,showLabels=True,plotType='3D',display ### Show the last one tSNE(numpy.array(matrix),column_header,dataset_name,group_db,display=True, showLabels=showLabels,row_header=row_header,colorByGene=gene,species=species, - reimportModelScores=reimportModelScores,method=algorithm) + reimportModelScores=reimportModelScores,method=algorithm) + elif maskGroups!=None: + """ Mask the samples not present in each examined group below """ + import ExpressionBuilder + sample_group_db = ExpressionBuilder.simplerGroupImport(maskGroups) + group_sample_db = {} + for sample in sample_group_db: + try: group_sample_db[sample_group_db[sample]].append(sample) + except: group_sample_db[sample_group_db[sample]] = [sample] + 
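# A small sketch of the nipy_spectral group-coloring pattern this patch switches to
# (replacing gist_rainbow); the group count and function name are assumptions.
import matplotlib
matplotlib.use('Agg')    # headless backend for this sketch
import pylab

def group_colors(n_groups, name='nipy_spectral'):
    """Return n_groups evenly spaced RGBA tuples from the named colormap, mirroring
    the get_cmap/assignGroupColors usage in this module."""
    cm = pylab.cm.get_cmap(name)
    return [cm(1.0 * i / n_groups) for i in range(n_groups)]

# e.g. colors = group_colors(12)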
for group in group_sample_db: + restricted_samples = group_sample_db[group] + tSNE(numpy.array(matrix),column_header,dataset_name,group_db,display=display, + showLabels=showLabels,row_header=row_header,colorByGene=colorByGene,species=species, + reimportModelScores=reimportModelScores,method=algorithm,maskGroups=(group,restricted_samples)) else: tSNE(numpy.array(matrix),column_header,dataset_name,group_db,display=display, showLabels=showLabels,row_header=row_header,colorByGene=colorByGene,species=species, @@ -4336,11 +4405,13 @@ def runPCAonly(filename,graphics,transpose,showLabels=True,plotType='3D',display print traceback.format_exc() PrincipalComponentAnalysis(numpy.array(matrix), row_header, column_header, dataset_name, group_db, display=display, showLabels=showLabels, algorithm=algorithm, - geneSetName=geneSetName, species=species, colorByGene=colorByGene) + geneSetName=geneSetName, species=species, colorByGene=colorByGene, + reimportModelScores=reimportModelScores) else: PrincipalComponentAnalysis(numpy.array(matrix), row_header, column_header, dataset_name, group_db, display=display, showLabels=showLabels, algorithm=algorithm, - geneSetName=geneSetName, species=species, colorByGene=colorByGene) + geneSetName=geneSetName, species=species, colorByGene=colorByGene, + reimportModelScores=reimportModelScores) return graphic_link @@ -4553,7 +4624,7 @@ def buildGraphFromSIF(mod,species,sif_filename,ora_input_dir): output_filename = iGraphSimple(sif_filename,fold_db,pathway_name) except Exception: print 'igraph export failed due to an unknown error' - #print traceback.format_exc() + print traceback.format_exc() try: displaySimpleNetwork(sif_filename,fold_db,pathway_name) except Exception: pass ### GraphViz problem return output_filename @@ -4997,8 +5068,8 @@ def barchart(filename,index1,index2,x_axis,y_axis,title,display=False,color1='Sk header1=header[index1] header2=header[index2] else: - reference_data.append(int(t[index1])) - query_data.append(int(t[index2])) + reference_data.append(float(t[index1])) + query_data.append(float(t[index2])) name = t[0] if '_vs_' in name and 'event_summary' not in filename: name = string.split(name,'_vs_')[0] @@ -5031,10 +5102,10 @@ def barchart(filename,index1,index2,x_axis,y_axis,title,display=False,color1='Sk if output==False: pylab.savefig(filename[:-4]+'.pdf') - pylab.savefig(filename[:-4]+'.png') + #pylab.savefig(filename[:-4]+'.png') else: pylab.savefig(output[:-4]+'.pdf') - pylab.savefig(output[:-4]+'.png') + #pylab.savefig(output[:-4]+'.png') if display: print 'Exporting:',filename @@ -5622,6 +5693,15 @@ def importGeneList(gene_list_file,n=20): genes+=(n-len(genes))*[gene] genesets.append(genes) return genesets + +def simpleListImport(filename): + genesets=[] + genes=[] + for line in open(filename,'rU').xreadlines(): + gene = line.rstrip() + gene = string.split(gene,'\t')[0] + genes.append(gene) + return genes def customClean(filename): fn = filepath(filename) @@ -7008,12 +7088,39 @@ def removeMarkerFinderDoublets(heatmap_file,diff=1): try: sampleIndexSelection.filterFile(input_file,output_file,remove) except: sampleIndexSelection.filterFile(input_file,output_file,remove_alt) +def exportTFcorrelations(filename,TF_file,threshold): + + eo = export.ExportFile(filename[:-4]+'-TF-correlations.txt') + TFs = simpleListImport(TF_file) + x, column_header, row_header, dataset_name, group_db = importData(filename) + + ### For methylation data or other data with redundant signatures, remove these and only report the first one + with warnings.catch_warnings(): + 
warnings.filterwarnings("ignore",category=RuntimeWarning) ### hides import warnings + D1 = numpy.corrcoef(x) + i=0 + correlation_pairs=[] + for score_ls in D1: + k=0 + for v in score_ls: + if str(v)!='nan': + if k!=i: + #print row_header[i], row_header[k], v + if row_header[i] in TFs or row_header[k] in TFs: + #correlation_pairs.append([row_header[i],row_header[k],v]) + if v<(-1*threshold) or v>threshold: + eo.write(row_header[i]+'\t'+row_header[k]+'\t'+str(v)+'\n') + k+=1 + i+=1 + eo.close() + if __name__ == '__main__': filename='' + index1=2;index2=3; x_axis='Number of Differentially Expressed Genes'; y_axis = 'Comparisons'; title='Hippocampus - Number of Differentially Expressed Genes' #OutputFile = export.findParentDir(filename) #OutputFile = export.findParentDir(OutputFile[:-1])+'/test.pdf' - + #exportTFcorrelations('/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/SuperPan/ExpressionInput/exp.Cdt1-2139-genes.txt','/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Marie.Dominique/TF-to-gene/228-tfs.txt',0.1);sys.exit() #stackedbarchart(filename,display=True,output=OutputFile);sys.exit() #barchart(filename,index1,index2,x_axis,y_axis,title,display=True,color1='IndianRed',color2='SkyBlue');sys.exit() diff=0.7 @@ -7022,7 +7129,7 @@ def removeMarkerFinderDoublets(heatmap_file,diff=1): #removeMarkerFinderDoublets('/Volumes/salomonis2/Nancy_ratner/2mo-NF/exp.Figure_SX-ICGS-MarkerFinder.filt.txt',diff=diff);sys.exit() #outputForGOElite('/Users/saljh8/Desktop/R412X/completed/centroids.WT.R412X.median.txt');sys.exit() #simpleStatsSummary('/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/HCA/Mean-Comparisons/ExpressionInput/MergedFiles.Counts.UMI.txt');sys.exit() - a = '/Volumes/salomonis2/Grimes/Andre-10X/PROJ-00504/Project_s1115g01001_6lib_11lane_BCL/Combined_Captures/Day0/cellHarmony/exp.MarkerFinder-cellHarmony-reference__Day0-CPTT-log2-ReOrdered-Query.txt' + a = '/Volumes/salomonis2/TabulaMuris/Smart-Seq2_Nextera/CPTT-Files/all/header.txt' b = '/Volumes/salomonis2/Immune-10x-data-Human-Atlas/Bone-Marrow/Stuart/Browser/ExpressionInput/HS-compatible_symbols.txt' #b = '/data/salomonis2/GSE107727_RAW-10X-Mm/filtered-counts/ExpressionInput/Mm_compatible_symbols.txt' #a = '/Volumes/salomonis2/Immune-10x-data-Human-Atlas/Bone-Marrow/Stuart/Browser/head.txt' @@ -7030,9 +7137,9 @@ def removeMarkerFinderDoublets(heatmap_file,diff=1): #convertSymbolLog(a,b);sys.exit() #returnIntronJunctionRatio('/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Fluidigm_scRNA-Seq/12.09.2107/counts.WT-R412X.txt');sys.exit() #geneExpressionSummary('/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/combined-ICGS-Final/ExpressionInput/DEGs-LogFold_1.0_rawp');sys.exit() - b = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/SuperPan/ICGS-NMF-SLAM/groups.February2019.txt' - a = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/SuperPan/ICGS-NMF-SLAM/exp.FinalMarkerHeatmap.txt' - #convertGroupsToBinaryMatrix(b,a,cellHarmony=False);sys.exit() + b = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Liron/MRD-positive-GE/groups.GE.txt' + a = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Liron/MRD-positive-GE/exp.GE.txt' + convertGroupsToBinaryMatrix(b,a,cellHarmony=False);sys.exit() a = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Leucegene/July-2017/tests/events.txt' b = 
'/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Leucegene/July-2017/tests/clusters.txt' #simpleCombineFiles('/Users/saljh8/Desktop/dataAnalysis/Collaborative/Jose/NewTranscriptome/CombinedDataset/ExpressionInput/Events-LogFold_0.58_rawp')
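# A self-contained sketch of the thresholded correlation export added above as
# exportTFcorrelations; the gene symbols and matrix in the usage line are placeholders.
import numpy as np

def tf_correlation_pairs(matrix, row_names, tfs, threshold=0.1):
    """matrix: genes x samples; row_names: gene symbols per row; tfs: set of
    transcription-factor symbols. Returns (gene_i, gene_j, r) for every ordered pair
    that involves a TF and exceeds the absolute correlation threshold."""
    corr = np.corrcoef(np.asarray(matrix, dtype=float))
    pairs = []
    for i in range(len(row_names)):
        for k in range(len(row_names)):
            r = corr[i, k]
            if i != k and not np.isnan(r):
                if (row_names[i] in tfs or row_names[k] in tfs) and abs(r) > threshold:
                    pairs.append((row_names[i], row_names[k], float(r)))
    return pairs

# e.g. tf_correlation_pairs(np.random.rand(4, 20), ['Gata1', 'Spi1', 'Actb', 'Gapdh'], {'Gata1', 'Spi1'})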