diff --git a/AltAnalyze.py b/AltAnalyze.py index c6f975a..c538827 100755 --- a/AltAnalyze.py +++ b/AltAnalyze.py @@ -5114,7 +5114,7 @@ def AltAnalyzeSetup(skip_intro): if 'remoteViewer' == skip_intro: if os.name == 'nt': callWXPython() - elif os.name == 'ntX': + elif os.name == 'nt': package_path = filepath('python') win_package_path = string.replace(package_path,'python','AltAnalyzeViewer.exe') import subprocess @@ -8597,7 +8597,20 @@ def unpackConfigFiles(): are written to the user home directory in the folder 'altanalyze'.""" fn = filepath('Config/options.txt') ### See if a Config folder is already available + print fn fileExists = os.path.isfile(fn) + print 'Options.txt found in local Config = ',fileExists, + + try: + for line in open(fn,'r').readlines(): + break + print '...confirmed found' + except: + print '...confirmed not found' + + if 'AltAnalyze.app' in os.getcwd(): ### Overcomes potential problems with the above + fileExists = True + if fileExists == False: import subprocess import shutil diff --git a/AltDatabase/kallisto/0.43.1-splice/Mac/bin/kallisto b/AltDatabase/kallisto/0.43.1-splice/Mac/bin/kallisto index 7d11bb8..842931e 100755 Binary files a/AltDatabase/kallisto/0.43.1-splice/Mac/bin/kallisto and b/AltDatabase/kallisto/0.43.1-splice/Mac/bin/kallisto differ diff --git a/Config/defaults-expr.txt b/Config/defaults-expr.txt index a2d274a..7bb41ec 100755 --- a/Config/defaults-expr.txt +++ b/Config/defaults-expr.txt @@ -1 +1 @@ -array_type dabg_p rpkm_threshold gene_exp_threshold exon_exp_threshold exon_rpkm_threshold expression_threshold perform_alt_analysis analyze_as_groups expression_data_format normalize_feature_exp normalize_gene_data avg_all_for_ss include_raw_data probability_algorithm FDR_statistic batch_effects marker_finder visualize_results run_lineage_profiler run_goelite exon 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately AltMouse 0.75 NA NA NA NA 1 yes NA log NA NA NA yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately gene 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately 3'array NA NA NA NA NA NA NA NA log NA None NA no moderated t-test Benjamini-Hochberg no yes yes yes run immediately junction 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately RNASeq NA 1 200 5 0.5 5 yes yes non-log RPKM NA known exons no moderated t-test Benjamini-Hochberg no yes yes yes run immediately \ No newline at end of file +garray_type dabg_p rpkm_threshold gene_exp_threshold exon_exp_threshold exon_rpkm_threshold expression_threshold perform_alt_analysis analyze_as_groups expression_data_format normalize_feature_exp normalize_gene_data avg_all_for_ss include_raw_data probability_algorithm FDR_statistic batch_effects marker_finder visualize_results run_lineage_profiler run_goelite exon 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately AltMouse 0.75 NA NA NA NA 1 yes NA log NA NA NA yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately gene 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately 3'array NA NA NA NA NA NA NA NA log NA None NA no moderated t-test Benjamini-Hochberg no yes yes yes run immediately junction 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately RNASeq NA 1 200 5 0.5 5 yes yes non-log RPKM NA known exons no moderated t-test Benjamini-Hochberg no yes yes yes run immediately \ No newline at end of file diff --git a/ExpressionBuilder.py b/ExpressionBuilder.py index 405c4d8..94389dd 100644 --- a/ExpressionBuilder.py +++ b/ExpressionBuilder.py @@ -185,6 +185,14 @@ def calculate_expression_measures(expr_input_dir,expr_group_dir,experiment_name, ### differentiate data from column headers if x == 1: fold_data = fold_data[1:]; fold_data2=[] + """ + if len(array_names) != len(fold_data): + diff = len(fold_data)-len(fold_data) + fold_data+=diff*[''] + if arrayid == 'FOXN4|1/2|10F06_ENSP00000299162': + print fold_data;sys.exit() + """ + for fold in fold_data: fold = string.replace(fold,'"','') try: @@ -3133,19 +3141,40 @@ def filterByLocalJunctionExp(gene,features): if prior_gene == '!ENSG00000198001': ### For testing novel_junc_count = 0 + all_junc_count = 0 for junc in feature_exp_db: if "_" in junc: novel_junc_count+=1 - if novel_junc_count>5000: + all_junc_count+=1 + if novel_junc_count>1000: ### Indicates genomic variation resulting in broad diversity ### Will prevent function from running in a reasonable amount of time + #print "skipping" pass else: + start_time = time.time() + #print novel_junc_count, all_junc_count, prior_gene, filterByLocalJunctionExp(prior_gene,feature_exp_db) + end_time = time.time(); time_diff = int(end_time-start_time) + #print time_diff #try: gene_junction_denom[prior_gene] = [max(value) for value in zip(*gene_junction_denom[prior_gene])] # sum the junction counts for all junctions across the gene #except Exception: pass if platform == 'RNASeq': - - filterByLocalJunctionExp(prior_gene,feature_exp_db) + novel_junc_count = 0 + all_junc_count = 0 + for junc in feature_exp_db: + if "_" in junc: novel_junc_count+=1 + all_junc_count+=1 + if novel_junc_count>1000: + ### Indicates genomic variation resulting in broad diversity + ### Will prevent function from running in a reasonable amount of time + #print "skipping" + pass + else: + start_time = time.time() + #print novel_junc_count, all_junc_count, prior_gene, + filterByLocalJunctionExp(prior_gene,feature_exp_db) + end_time = time.time(); time_diff = int(end_time-start_time) + #print time_diff else: compareJunctionExpression(prior_gene) feature_exp_db={} @@ -3170,7 +3199,7 @@ def filterByLocalJunctionExp(gene,features): except Exception: graphic_links=[] """ print len(exported)/2,'junctions exported' #,len(retained_introns)/2, 'retained introns exported...' - return + return [] def getGeneAnnotations(species): gene_annotations={} diff --git a/RNASeq.py b/RNASeq.py index 70efc47..cc89184 100644 --- a/RNASeq.py +++ b/RNASeq.py @@ -5679,8 +5679,8 @@ def predictCellTypesFromClusters(icgs_groups_path, goelite_path): column_method = 'hopach' species = 'Hs' excludeCellCycle = False - icgs_groups_path='/Volumes/salomonis2/CCHMC-Collaborations/Rafi-Kopan-10X-Rhesus/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-20190506-3v3rhe/10X-Kopan-Monkey-Kidney-Cortex-Nuclei/outs/soupX-without_GENEL-LIST-0.5/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-0.5_matrix_CPTT/ICGS-NMF_cosine_cc/FinalGroups.txt' - goelite_path='/Volumes/salomonis2/CCHMC-Collaborations/Rafi-Kopan-10X-Rhesus/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-20190506-3v3rhe/10X-Kopan-Monkey-Kidney-Cortex-Nuclei/outs/soupX-without_GENEL-LIST-0.5/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-0.5_matrix_CPTT/ICGS-NMF_cosine_cc/GO-Elite/clustering/exp.FinalMarkerHeatmap_all/GO-Elite_results/pruned-results_z-score_elite.txt' + icgs_groups_path='/Users/saljh8/Downloads/Correlation_files_BRCA/ICGS-NMF/FinalGroups.txt' + goelite_path='/Users/saljh8/Downloads/Correlation_files_BRCA/ICGS-NMF/GO-Elite/clustering/exp.FinalMarkerHeatmap_all/GO-Elite_results/pruned-results_z-score_elite.txt' predictCellTypesFromClusters(icgs_groups_path, goelite_path);sys.exit() platform = 'RNASeq'; graphic_links=[('','/Volumes/HomeBackup/CCHMC/PBMC-10X/ExpressionInput/SamplePrediction/DataPlots/Clustering-33k_CPTT_matrix-CORRELATED-FEATURES-iterFilt-hierarchical_cosine_cosine.txt')] """ @@ -5690,7 +5690,7 @@ def predictCellTypesFromClusters(icgs_groups_path, goelite_path): """ import UI; import multiprocessing as mlp - #runKallisto('Mm','BoneMarrow','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/altanalyze/Mm-FASTQ','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/altanalyze/Mm-FASTQ',mlp);sys.exit() + runKallisto('Mm','ALP-ILC','/Volumes/salomonis2/PublicDatasets/GSE113765-ILC-Mm/bulk-RNASeq/','/Volumes/salomonis2/PublicDatasets/GSE113765-ILC-Mm/bulk-RNASeq/',mlp);sys.exit() runKallisto('Hs','BreastCancer','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/BreastCancerDemo/FASTQs/input','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/BreastCancerDemo/FASTQs/input',mlp);sys.exit() results_file = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/l/July-2017/PSI/test/Clustering-exp.round2-Guide3-hierarchical_cosine_correlation.txt' diff --git a/UI.py b/UI.py index 19e8882..3a931e2 100755 --- a/UI.py +++ b/UI.py @@ -2539,7 +2539,7 @@ def checkbuttoncallback(tag,state,checkbuttoncallback=self.checkbuttoncallback,o quit_win.pack(side = 'right', padx =10, pady = 5) button_text = 'Help' - url = 'http://www.altanalyze.org/help_main.htm'; self.url = url + url = 'https://altanalyze.readthedocs.io/en/latest/'; self.url = url pdf_help_file = 'Documentation/AltAnalyze-Manual.pdf'; pdf_help_file = filepath(pdf_help_file); self.pdf_help_file = pdf_help_file try: help_button = Button(self._parent, text=button_text, command=self.GetHelpTopLevel); help_button.pack(side = 'left', padx = 5, pady = 5) diff --git a/build_scripts/setup_binary.py b/build_scripts/setup_binary.py index 33eb632..c7d3cd3 100755 --- a/build_scripts/setup_binary.py +++ b/build_scripts/setup_binary.py @@ -8,7 +8,7 @@ _script = 'AltAnalyze.py' _appName = "AltAnalyze" -_appVersion = '2.1.3' +_appVersion = '2.1.4.1' _appDescription = "AltAnalyze is a freely available, open-source and cross-platform program that allows you to processes raw bulk or single-cell RNASeq and " _appDescription +="microarray data, identify predicted alternative splicing or alternative promoter changes and " _appDescription +="view how these changes may affect protein sequence, domain composition, and microRNA targeting." @@ -68,6 +68,7 @@ options = {"py2app": {"excludes": excludes, "includes": includes, + 'plist': 'Info.plist', #"frameworks": frameworks, #"resources": resources, #"argv_emulation": True, diff --git a/import_scripts/ChromiumProcessing.py b/import_scripts/ChromiumProcessing.py index ea3b4e4..293ac57 100755 --- a/import_scripts/ChromiumProcessing.py +++ b/import_scripts/ChromiumProcessing.py @@ -13,7 +13,7 @@ except: print ('Missing the h5py library (hdf5 support)...') -def import10XSparseMatrix(matrices_dir,genome,dataset_name, expFile=None, log=True): +def import10XSparseMatrix(matrices_dir,genome,dataset_name, expFile=None, log=True, geneIDs=False): start_time = time.time() if '.h5' in matrices_dir: @@ -50,9 +50,11 @@ def import10XSparseMatrix(matrices_dir,genome,dataset_name, expFile=None, log=Tr barcodes = [row[0] for row in csv.reader(gzip.open(barcodes_path), delimiter="\t")] else: gene_ids = [row[0] for row in csv.reader(open(genes_path), delimiter="\t")] - print gene_ids[0:10] gene_names = [row[1] for row in csv.reader(open(genes_path), delimiter="\t")] barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")] + + if geneIDs: + gene_names = gene_ids #barcodes = map(lambda x: string.replace(x,'-1',''), barcodes) ### could possibly cause issues with comparative analyses matrices_dir = os.path.abspath(os.path.join(matrices_dir, os.pardir)) @@ -128,15 +130,19 @@ def calculateCPTT(val,barcode_sum): filter_file=None genome = 'hg19' dataset_name = '10X_filtered' + geneID = False if len(sys.argv[1:])<=1: ### Indicates that there are insufficient number of command-line arguments print "Insufficient options provided";sys.exit() #Filtering samples in a datasets #python 10XProcessing.py --i /Users/test/10X/outs/filtered_gene_bc_matrices/ --g hg19 --n My10XExperiment else: - options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','g=','n=']) + options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','g=','n=','geneID=']) #print sys.argv[1:] for opt, arg in options: if opt == '--i': matrices_dir=arg elif opt == '--g': genome=arg elif opt == '--n': dataset_name=arg - import10XSparseMatrix(matrices_dir,genome,dataset_name) \ No newline at end of file + elif opt == '--geneID': + geneID = True + + import10XSparseMatrix(matrices_dir,genome,dataset_name,geneIDs = geneID) \ No newline at end of file diff --git a/import_scripts/mergeFiles.py b/import_scripts/mergeFiles.py index 64b6c4f..5b88f01 100755 --- a/import_scripts/mergeFiles.py +++ b/import_scripts/mergeFiles.py @@ -72,7 +72,12 @@ def combineAllLists(files_to_merge,original_filename,includeColumns=False): file = string.split(filename,'\\')[-1][:-4] for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) - t = string.split(data,'\t') + if '\t' in data: + t = string.split(data,'\t') + elif ',' in data: + t = string.split(data,',') + else: + t = string.split(data,'\t') if x==0: if data[0]!='#': x=1 @@ -201,7 +206,12 @@ def combineUniqueAllLists(files_to_merge,original_filename): file = string.split(filename,'\\')[-1][:-4] for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) - t = string.split(data,'\t') + if '\t' in data: + t = string.split(data,'\t') + elif ',' in data: + t = string.split(data,',') + else: + t = string.split(data,'\t') if x==0: if data[0]!='#': x=1 diff --git a/import_scripts/sampleIndexSelection.py b/import_scripts/sampleIndexSelection.py index d1c8df1..1e6da3d 100755 --- a/import_scripts/sampleIndexSelection.py +++ b/import_scripts/sampleIndexSelection.py @@ -145,9 +145,16 @@ def filterFile(input_file,output_file,filter_names,force=False,calculateCentroid means={} for cluster in group_index_db: #### group_index_db[cluster] is all of the indeces for samples in a noted group, cluster is the actual cluster name (not number) - try: mean=statistics.avg(map(lambda x: float(filtered_values[x]), group_index_db[cluster])) - except: - continue + raw_values = map(lambda x: filtered_values[x], group_index_db[cluster]) + raw_values2=[] + for vx in raw_values: + if vx != '': + raw_values2.append(float(vx)) + + if len(raw_values2)>2: + mean=statistics.avg(raw_values2) + else: + mean = "" #mean = map(lambda x: filtered_values[uid][x], group_index_db[cluster]) ### Only one value means[cluster]=mean mean_matrix.append(str(mean)) diff --git a/stats_scripts/cellHarmony.py b/stats_scripts/cellHarmony.py index 64abd6c..eb80202 100755 --- a/stats_scripts/cellHarmony.py +++ b/stats_scripts/cellHarmony.py @@ -78,7 +78,7 @@ def manage_louvain_alignment(species,platform,query_exp_file,exp_output, ref = reference query = query_exp_file - louvain_results = cluster_corr.find_nearest_cells(ref, + louvain_results, ref_results = cluster_corr.find_nearest_cells(ref, query, gene_list=gene_list, num_neighbors=10, @@ -87,6 +87,7 @@ def manage_louvain_alignment(species,platform,query_exp_file,exp_output, min_cluster_correlation=-1, genome=species) cluster_corr.write_results_to_file(louvain_results, output_classification_file, labels=customLabels) + cluster_corr.write_results_to_file(ref_results, output_classification_file[:-4]+'-reference.txt', labels=customLabels) try: LineageProfilerIterate.harmonizeClassifiedSamples(species, reference, query_exp_file, output_classification_file,fl=fl) diff --git a/stats_scripts/cluster_corr.py b/stats_scripts/cluster_corr.py index 64912db..56aebbc 100755 --- a/stats_scripts/cluster_corr.py +++ b/stats_scripts/cluster_corr.py @@ -111,7 +111,7 @@ def add_labels(barcode): print("\t".join( ("Query Barcode", "Ref Barcode", "Correlation", "Query Partition", "Ref Partition") ), file=f) for q in results.keys(): print("\t".join( (q, - results[q]['barcode'], + results[q]['barcode'].replace('.Reference',''), str(results[q]['correlation']), str(results[q]['query_partition']), str(results[q]['ref_partition'])) ), file=f) @@ -119,8 +119,8 @@ def add_labels(barcode): with open(filename, 'w') as f: print("\t".join( ("Query Barcode", "Ref Barcode", "Correlation", "Query Partition", "Ref Partition", "Label") ), file=f) for q in results.keys(): - print("\t".join( (q, - results[q]['barcode'], + print("\t".join( (q.replace('.Reference',''), + results[q]['barcode'].replace('.Reference',''), str(results[q]['correlation']), str(results[q]['query_partition']), str(results[q]['ref_partition']), diff --git a/stats_scripts/preAligned.py b/stats_scripts/preAligned.py index 20e0052..ac90a15 100644 --- a/stats_scripts/preAligned.py +++ b/stats_scripts/preAligned.py @@ -98,7 +98,10 @@ def exportCellClassifications(output_file,query_cells,filtered_query_cells,repre CI = query_cells[query_barcode] cluster_number = CI.ClusterNumber() label = CI.Label() - ref_barcode = representative_refcluster_cell[label][-1] + try: + ref_barcode = representative_refcluster_cell[label][-1] + except: + continue values = [query_barcode,ref_barcode,'1.0',cluster_number,cluster_number,label] o.write(string.join(values,'\t')+'\n') o.close() @@ -153,29 +156,40 @@ def importCelltoClusterAnnotations(filename): t = string.split(data,',') if firstRow: ci = t.index('cell_id') - cn = t.index('cluster_number') + try: cn = t.index('cluster_number') + except: cn = 'False' try: cm = t.index('cluster_name') - except: cm = False + except: cm = 'False' + try: cnm = t.index('ClustNameNum') + except: cnm = 'False' + try: cnm = t.index('label') + except: pass dn = t.index('dataset_name') dt = t.index('dataset_type') firstRow = False else: cell_id = t[ci] - cluster_number = t[cn] + try: cluster_number = t[cn] + except: cluster_number = 'False' dataset_name = t[dn] dataset_type = t[dt] - if cm != False: + if cnm !='False': + label = t[cnm] + if cluster_number == 'False': + cluster_number = label + elif cm != False: cluster_name = t[cm] label = cluster_name + '_c'+cluster_number + elif cluster_number == False: + label = t[cm] else: label = 'c'+cluster_number - if string.lower(dataset_type)[0] == 'r': dataset_type = 'Reference' reference_dataset = dataset_name CI = CellInfo(cell_id, cluster_number, dataset_name, dataset_type, label) refererence_cells[cell_id]=CI - else: + elif string.lower(dataset_type)[0] == 'q': dataset_type = 'Query' query_dataset = dataset_name CI = CellInfo(cell_id, cluster_number, dataset_name, dataset_type, label) diff --git a/stats_scripts/segregateCellsBySex.py b/stats_scripts/segregateCellsBySex.py new file mode 100644 index 0000000..311ea99 --- /dev/null +++ b/stats_scripts/segregateCellsBySex.py @@ -0,0 +1,101 @@ +import sys,string,os +sys.path.insert(1, os.path.join(sys.path[0], '..')) ### import parent dir dependencies +import export +import unique +import traceback +import numpy as np + +""" Determine and segregate single-cell based on sex associated gene expression """ + +def cleanUpLine(line): + line = string.replace(line,'\n','') + line = string.replace(line,'\c','') + data = string.replace(line,'\r','') + data = string.replace(data,'"','') + return data + +def computeOnSexTranscripts(fn): + """ Import a flat single-cell expression matrix """ + female = ['TSIX','XIST','ENSG00000229807','ENSMUSG00000086503','ENSG00000270641','ENSMUSG00000085715'] + male = ['EIF2S3Y','EDX3Y','UTY','ENSMUSG00000069049','ENSMUSG00000069049', 'ENSG00000067048', 'ENSMUSG00000069045','ENSMUSP00000070012', 'ENSG00000183878'] + informative_genes = male+female + female_data = [] + male_data = [] + firstRow=True + for line in open(fn,'rU').xreadlines(): + data = cleanUpLine(line) + t = string.split(data,'\t') + if firstRow: + header = t[1:] + firstRow = False + else: + gene = t[0] + gene_upper = string.upper(gene) + if gene_upper in informative_genes: + values = map(float,t[1:]) + if gene_upper in male: + male_data.append(values) + else: + female_data.append(values) + + male_data = np.array(male_data) + female_data = np.array(female_data) + male_data = np.average(male_data, axis=0) + female_data = np.average(female_data, axis=0) + ratios = [i - j for i, j in zip(female_data, male_data)] + + female_count=0 + male_count=0 + uknown=0 + i=0 + females=[] + males=[] + unknowns=[] + for ratio in ratios: + if ratio>0: + female_count+=1 + females.append(header[i]) + elif ratio<0: + male_count+=1 + males.append(header[i]) + else: + uknown+=1 + unknowns.append(header[i]) + i+=1 + + male_ratio = 100*(male_count*1.00)/((male_count*1.00)+(female_count*1.00)+uknown*1.00) + female_ratio = 100*(female_count*1.00)/((male_count*1.00)+(female_count*1.00)+uknown*1.00) + uknown_ratio = 100*(uknown*1.00)/((male_count*1.00)+(female_count*1.00)+uknown*1.00) + + print 'Male cell percentage:',male_ratio + print 'Female cell percentage:',female_ratio + print 'Unknown cell percentage:',uknown_ratio + + from import_scripts import sampleIndexSelection + sampleIndexSelection.filterFile(fn,fn[:-4]+'-female.txt',['row_clusters-flat']+females,force=True) + sampleIndexSelection.filterFile(fn,fn[:-4]+'-male.txt',['row_clusters-flat']+males,force=True) + + export_object = open(fn[:-4]+'-groups.txt','w') + def writeData(header,group): + export_object.write(header+'\t'+group+'\n') + map(lambda x: writeData(x,'male'),males) + map(lambda x: writeData(x,'female'),females) + map(lambda x: writeData(x,'unknown'),unknowns) + export_object.close() + +if __name__ == '__main__': + ################ Comand-line arguments ################ + import getopt + + if len(sys.argv[1:])<=1: ### Indicates that there are insufficient number of command-line arguments + print 'WARNING!!!! Too commands supplied.' + + else: + options, remainder = getopt.getopt(sys.argv[1:],'', ['i=']) + #print sys.argv[1:] + for opt, arg in options: + if opt == '--i': + inputFile = arg + + computeOnSexTranscripts(inputFile) + diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..6a6c307 --- /dev/null +++ b/test.txt @@ -0,0 +1,5 @@ +name harold bob frank sally kim tim +a 0 0 1 2 0 5 +b 0 0 1 2 0 5 +c 0 0 1 2 0 5 +d 0 0 1 2 0 5 diff --git a/visualization_scripts/clustering.py b/visualization_scripts/clustering.py index 780bc34..0d801d7 100644 --- a/visualization_scripts/clustering.py +++ b/visualization_scripts/clustering.py @@ -815,7 +815,9 @@ def formatpval(p): feature_id = gene_to_symbol[feature_id][0] if original_feature_id in justShowTheseIDs: feature_id = original_feature_id - if display_label_names and 'ticks' not in justShowTheseIDs: + if len(justShowTheseIDs)<40: + axm.text(x.shape[1]-0.4, i-radj, feature_id,fontsize=column_fontsize, color=color,picker=True) ### When not clustering rows + elif display_label_names and 'ticks' not in justShowTheseIDs: if alternate==1: buffer=1.2; alternate=2 elif alternate==2: buffer=2.4; alternate=3 elif alternate==3: buffer=3.6; alternate=4 @@ -993,13 +995,13 @@ def onpick1(event): cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) #cmap_c = matplotlib.colors.ListedColormap(['#7CFC00','k']) cmap_c = matplotlib.colors.ListedColormap(['w', 'k']) + elif len(unique.unique(ind2))==3: ### cmap_c is too few colors + cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) + cmap_c = matplotlib.colors.ListedColormap(['b', 'y', 'r']) elif len(unique.unique(ind2))>0: ### cmap_c is too few colors #cmap_c = pylab.cm.Paired cmap_c = PairedColorMap() """ - elif len(unique.unique(ind2))==3: ### cmap_c is too few colors - cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) - #cmap_c = matplotlib.colors.ListedColormap(['r', 'y', 'b']) elif len(unique.unique(ind2))==4: ### cmap_c is too few colors cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C','#FEBC18']) #['#FEBC18','#EE2C3C','#3D3181','#88BF47'] #cmap_c = matplotlib.colors.ListedColormap(['k', 'w', 'w', 'w']) @@ -1045,13 +1047,13 @@ def onpick1(event): if len(unique.unique(ind2_clust))==2: ### cmap_c is too few colors #cmap_c = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) cmap_c = matplotlib.colors.ListedColormap(['w', 'k']) + elif len(unique.unique(ind2_clust))==3: ### cmap_c is too few colors + cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) + cmap_c = matplotlib.colors.ListedColormap(['b', 'y', 'r']) elif len(unique.unique(ind2_clust))>0: ### cmap_c is too few colors #cmap_c = pylab.cm.Paired cmap_c = PairedColorMap() """ - elif len(unique.unique(ind2_clust))==3: ### cmap_c is too few colors - cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) - #cmap_c = matplotlib.colors.ListedColormap(['r', 'y', 'b']) elif len(unique.unique(ind2_clust))==4: ### cmap_c is too few colors cmap_c = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C', '#FEBC18']) #cmap_c = matplotlib.colors.ListedColormap(['black', '#1DA532', 'b','r']) @@ -1084,13 +1086,14 @@ def onpick1(event): if len(unique.unique(ind2))==2: ### cmap_c is too few colors #cmap_d = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) cmap_d = matplotlib.colors.ListedColormap(['w', 'k']) + elif len(unique.unique(ind2))==3: ### cmap_c is too few colors + cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) + cmap_d = matplotlib.colors.ListedColormap(['b', 'y', 'r']) + #cmap_d = matplotlib.colors.ListedColormap(['b', 'y', 'r']) elif len(unique.unique(ind2))>0: ### cmap_c is too few colors #cmap_d = pylab.cm.Paired cmap_d = PairedColorMap() """ - elif len(unique.unique(ind2))==3: ### cmap_c is too few colors - cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) - #cmap_d = matplotlib.colors.ListedColormap(['r', 'y', 'b']) elif len(unique.unique(ind2))==4: ### cmap_c is too few colors cmap_d = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C', '#FEBC18']) elif len(unique.unique(ind2))==5: ### cmap_c is too few colors @@ -1512,6 +1515,8 @@ def systemCodeCheck(IDs): import gene_associations id_type_db={} for id in IDs: + if ':' in id: + id = string.split(id,':')[1] id_type = gene_associations.predictIDSourceSimple(id) try: id_type_db[id_type]+=1 except Exception: id_type_db[id_type]=1 @@ -1585,6 +1590,7 @@ def exportFlatClusterData(filename, root_dir, dataset_name, new_row_header,new_c else: sy = 'Sy' except Exception: pass + try: cluster_db[cluster].append(id) except Exception: cluster_db[cluster] = [id] try: export_lines.append(string.join([original_id,str(ind1[i])]+map(str, row),'\t')+'\n') @@ -2741,12 +2747,13 @@ def tSNE(matrix, column_header,dataset_name,group_db,display=True,showLabels=Fal if numberGenesPresent==2: cm = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) #cm = matplotlib.colors.ListedColormap(['w', 'k']) ### If you want to hide one of the groups + elif numberGenesPresent==3: + cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) + cm = matplotlib.colors.ListedColormap(['b', 'y', 'r']) else: - cm = pylab.cm.get_cmap('gist_rainbow') + #cm = pylab.cm.get_cmap('gist_rainbow') cm = pylab.cm.get_cmap('Paired') """ - elif numberGenesPresent==3: - cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) elif numberGenesPresent==4: cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C', '#FEBC18']) elif numberGenesPresent==5: @@ -2792,7 +2799,7 @@ def get_cmap(n, name='hsv'): colors = get_cmap(len(genes)) for i in range(len(ranges)): if i==0: - color = '#C0C0C0' + color = '#F1F1F1' else: if numberGenesPresent==1: ### use a single color gradient @@ -2805,7 +2812,7 @@ def get_cmap(n, name='hsv'): else: color = colors(k) else: - color = '#C0C0C0' + color = '#F1F1F1' color_db[ranges[i]] = color i=0 for val in values: @@ -3196,7 +3203,7 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, ### transform u into the same structure as the original scores from linalg.eig coeff scores = vt - writetSNEScores(scores,root_dir+dataset_name+'-PCA_scores.txt') + #writetSNEScores(scores,root_dir+dataset_name+'-PCA_scores.txt') fig = pylab.figure() ax = fig.add_subplot(111) @@ -3261,12 +3268,12 @@ def PrincipalComponentAnalysis(matrix, column_header, row_header, dataset_name, if numberGenesPresent==2: cm = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) #cm = matplotlib.colors.ListedColormap(['w', 'k']) ### If you want to hide one of the groups + elif numberGenesPresent==3: + cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) else: - cm = pylab.cm.get_cmap('gist_rainbow') + #cm = pylab.cm.get_cmap('gist_rainbow') cm = pylab.cm.get_cmap('Paired') """ - elif numberGenesPresent==3: - cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) elif numberGenesPresent==4: cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C', '#FEBC18']) elif numberGenesPresent==5: @@ -9015,8 +9022,72 @@ def computeIsoformRatio(gene_exp_file, isoform_exp_file): #max_ratios = max(map(float,ratios)) eo.close() + +def TFisoToGene(filename,marker_genes): + + firstRow=True + gene_db={} + for line in open(marker_genes, 'rU').xreadlines(): + data = cleanUpLine(line) + t = string.split(data, '\t') + gene = t[0] + tissue = t[-1] + if firstRow: + firstRow=False + else: + try: gene_db[gene].append(tissue) + except: gene_db[gene] = [tissue] + + firstRow=True + interaction_TF_db={} + tf_iso_db={} + for line in open(filename, 'rU').xreadlines(): + data = cleanUpLine(line) + TFiso,gene,interaction = string.split(data, '\t') + if firstRow: + firstRow=False + else: + try: tf_iso_db[gene].append(TFiso) + except: tf_iso_db[gene] = [TFiso] + if gene in interaction_TF_db: + db = interaction_TF_db[gene] + try: db[interaction].append(TFiso) + except: db[interaction] = [TFiso] + else: + db = {} + db[interaction] = [TFiso] + interaction_TF_db[gene] = db + for gene in interaction_TF_db: + eo = export.ExportFile(filename[:-4]+'/'+gene+'.txt') + iso_db = interaction_TF_db[gene] + isos=[] + for tf_iso in tf_iso_db[gene]: + if tf_iso not in isos: + isos.append(tf_iso) + eo.write(string.join(['Partner']+isos,'\t')+'\n') + for interaction in iso_db: + values=[] + for tf_iso in isos: + if tf_iso in iso_db[interaction]: + values.append('1') + else: + values.append('0') + if interaction in gene_db: + name = interaction+'--'+gene_db[interaction][0] + else: + name = interaction + eo.write(string.join([name]+values,'\t')+'\n') + eo.close() + if __name__ == '__main__': + b = '/Volumes/salomonis2/Immune-10x-data-Human-Atlas/Bone-Marrow/Stuart/Browser/ExpressionInput/HS-compatible_symbols.txt' + b = '/data/salomonis2/GSE107727_RAW-10X-Mm/filtered-counts/ExpressionInput/Mm_compatible_symbols.txt' + input_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-10x/Scadden-90k/exp.BoneMarrow-90k-filtered.txt' + #convertSymbolLog(input_file,b,species='Mm',logNormalize=False); sys.exit() + + marker_genes = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Isoform-U01/6k-Genecode30/GTEx/Gene-level/ExpressionInput/Genes-MarkerFinder.txt' + #TFisoToGene('/Users/saljh8/Downloads/clones.txt',marker_genes);sys.exit() #displaySimpleNetworkX();sys.exit() input_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Autism/PRJNA434002/ICGS-NMF/CellFrequencies/FinalGroups-CellTypesFull-Author.txt' #summarizeCovariates(input_file);sys.exit() @@ -9027,7 +9098,11 @@ def computeIsoformRatio(gene_exp_file, isoform_exp_file): isoform_exp = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Isoform-U01/6k-Genecode30/protein.Gtex-GC30_6k-selected-tissues-TFs.txt' gene_exp = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Isoform-U01/6k-Genecode30/gene.Gtex-GC30_6k-selected-tissues-TFs.txt' - computeIsoformRatio(gene_exp,isoform_exp);sys.exit() + + isoform_exp = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Isoform-U01/6k-Genecode30/TCGA-BRCA/protein.BreastCancer-GC30-6K.txt' + gene_exp = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Isoform-U01/6k-Genecode30/TCGA-BRCA/gene.BreastCancer-GC30-6K.txt' + + #computeIsoformRatio(gene_exp,isoform_exp);sys.exit() #aggregateMarkerFinderResults('/Volumes/salomonis2/LabFiles/TabulaMuris/Smart-Seq2_Nextera/CPTT-Files/all-comprehensive/');sys.exit() groups_file = '/data/salomonis2/LabFiles/TabulaMuris/Smart-Seq2_Nextera/CPTT-Files/all-comprehensive/FACS_annotation-edit.txt' exp_dir = '/data/salomonis2/LabFiles/TabulaMuris/Smart-Seq2_Nextera/CPTT-Files/all-comprehensive/MergedFiles.txt' @@ -9107,14 +9182,10 @@ def computeIsoformRatio(gene_exp_file, isoform_exp_file): #outputForGOElite('/Users/saljh8/Desktop/R412X/completed/centroids.WT.R412X.median.txt');sys.exit() #simpleStatsSummary('/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/HCA/Mean-Comparisons/ExpressionInput/MergedFiles.Counts.UMI.txt');sys.exit() a = '/Users/saljh8/Downloads/groups.CellTypes-Predicted-Label-Transfer-For-Nuclei-matrix.txt' - b = '/Volumes/salomonis2/Immune-10x-data-Human-Atlas/Bone-Marrow/Stuart/Browser/ExpressionInput/HS-compatible_symbols.txt' - b = '/data/salomonis2/GSE107727_RAW-10X-Mm/filtered-counts/ExpressionInput/Mm_compatible_symbols.txt' - input_file = '/Users/saljh8/Downloads/exprMatrix.txt' ##transposeMatrix(a);sys.exit() - convertSymbolLog(input_file,b,species='Hs',logNormalize=False); sys.exit() #returnIntronJunctionRatio('/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Fluidigm_scRNA-Seq/12.09.2107/counts.WT-R412X.txt');sys.exit() #geneExpressionSummary('/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/combined-ICGS-Final/ExpressionInput/DEGs-LogFold_1.0_rawp');sys.exit() - b = '/Users/saljh8/Dropbox/scRNA-Seq Markers/Human/Expression/Lung/Adult/PRJEB31843/1k-T0-cellHarmony-groups2.txt' + b = '/Users/saljh8/Downloads/Correlation_files_BRCA/ICGS-NMF/groups.FinalMarkerHeatmap_all.txt' a = '/Users/saljh8/Dropbox/scRNA-Seq Markers/Human/Expression/Lung/Adult/Perl-CCHMC/FinalMarkerHeatmap_all.txt' convertGroupsToBinaryMatrix(b,b,cellHarmony=False);sys.exit() a = '/Users/saljh8/Desktop/temp/groups.TNBC.txt'