diff --git a/Config/options.txt b/Config/options.txt index 77b1ef6..16160f1 100755 --- a/Config/options.txt +++ b/Config/options.txt @@ -20,7 +20,7 @@ dataset_name Give a name to this dataset enter InputCELFiles --- --- --- --- input_cel_dir Select the CEL file containing folder folder InputCELFiles --- --- --- --- --- --- --- input_fastq_dir (optional) Select fastq files to run in Kallisto folder InputCELFiles NA NA NA NA NA --- NA output_CEL_dir Select an AltAnalyze result output directory folder InputCELFiles --- --- --- --- --- --- --- -multithreading Use multithreading for read genomic annotation comboBox InputCELFiles yes NA NA NA NA NA yes|no NA +multithreading Use multithreading for read genomic annotation comboBox InputCELFiles no NA NA NA NA NA yes|no NA build_exon_bedfile Build exon coordinate bed file to obtain BAM file exon counts\k(see the online tutorial for additional details and information) single-checkbox InputCELFiles NA NA NA NA NA --- NA channel_to_extract Extract data from the following channels comboBox InputCELFiles NA NA NA green|red|green/red ratio|red/green ratio NA NA NA remove_xhyb Remove probesets that have large cross-hybridization scores single-checkbox InputCELFiles --- --- NA NA NA NA NA @@ -139,10 +139,16 @@ modelDiscovery (LineageProflier) Iterative model discovery comboBox LineageProfi input_data_file Select the tab-delimited expression file for ID Translation file IDConverter --- --- --- --- --- --- --- input_source Select the file ID system (first column) comboBox IDConverter --- --- --- --- --- --- --- output_source Select the ID system to add to this file comboBox IDConverter --- --- --- --- --- --- --- -input_file1 Select the first file to merge file MergeFiles --- --- --- --- --- --- --- -input_file2 Select the second file to merge file MergeFiles --- --- --- --- --- --- --- -input_file3 Select the third file to merge (optional) file MergeFiles --- --- --- --- --- --- --- -input_file4 Select the fourth file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file1 Select the 1st file to merge file MergeFiles --- --- --- --- --- --- --- +input_file2 Select the 2nd file to merge file MergeFiles --- --- --- --- --- --- --- +input_file3 Select the 3rd file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file4 Select the 4th file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file5 Select the 5th file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file6 Select the 6th file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file7 Select the 7th file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file8 Select the 8th file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file9 Select the 9th file to merge (optional) file MergeFiles --- --- --- --- --- --- --- +input_file10 Select the 10th file to merge (optional) file MergeFiles --- --- --- --- --- --- --- join_option Join files based on their comboBox MergeFiles Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union Intersection|Union ID_option Only return one-to-one ID relationships comboBox MergeFiles False|True False|True False|True False|True False|True False|True False|True output_merge_dir Select the folder to save the merged file folder MergeFiles --- --- --- --- --- --- --- @@ -175,12 +181,13 @@ FoldDiff Fold change filter cutoff enter PredictGroups 4 --- --- --- --- --- - rho_cutoff Minimum Pearson correlation cutoff enter PredictGroups 0.2 --- --- --- --- --- --- --- SamplesDiffering Minimum number of cells differing enter PredictGroups 4 --- --- --- --- --- --- --- dynamicCorrelation ICGS will identify an optimal correlation cutoff comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no -removeOutliers Remove low expression outlier samples comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no +removeOutliers Remove low expression outlier cells comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no featuresToEvaluate Features to evaluate comboBox PredictGroups Genes Genes|AltExon|Both Genes|AltExon|Both Genes|AltExon|Both Genes Genes|AltExon|Both Genes|AltExon|Both Genes restrictBy Restrict genes to protein coding comboBox PredictGroups yes yes|no yes|no yes|no yes|no yes|no yes|no yes|no column_metric_predict Select the column clustering metric comboBox PredictGroups http://docs.scipy.org/doc/scipy/reference/spatial.distance.html cosine braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule braycurtis|canberra|chebyshev|cityblock|correlation|cosine|dice|euclidean|hamming|jaccard|kulsinski|mahalanobis|matching|minkowski|rogerstanimoto|russellrao|seuclidean|sokalmichener|sokalsneath|sqeuclidean|yule column_method_predict Select the column clustering method comboBox PredictGroups http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach average|single|complete|weighted|ward|hopach k (optional) number of user-defined ICGS clusters (k) enter PredictGroups --- --- --- --- --- --- --- +downsample (optional) Cells to down-sample to (PageRank) enter PredictGroups 2500 --- --- --- --- --- --- --- GeneSelectionPredict (optional) Enter genes to build clusters from (guides) enter PredictGroups --- --- --- --- --- --- --- GeneSetSelectionPredict (optional) Or select guide GeneSet/Ontology comboBox PredictGroups PathwaySelectionPredict (optional) Select guide specific GeneSet(s) multiple-comboBox PredictGroups diff --git a/GO_Elite.py b/GO_Elite.py index fcbb4ee..01c7b55 100644 --- a/GO_Elite.py +++ b/GO_Elite.py @@ -229,12 +229,14 @@ def moveMAPPFinderFiles(input_dir): #try: UI.WarningWindow(print_out,' Exit ') #except Exception: print print_out proceed = 'no' - while proceed == 'no': - try: os.remove(fn); proceed = 'yes' - except Exception: - print 'Tried to move the file',mappfinder_input,'to an archived folder, but it is currently open.' - print 'Please close this file and hit return or quit GO-Elite' - inp = sys.stdin.readline() + + #while proceed == 'no': + try: os.remove(fn); proceed = 'yes' + except Exception: + pass + #print 'Tried to move the file',mappfinder_input,'to an archived folder, but it is currently open.' + #print 'Please close this file and hit return or quit GO-Elite' + #inp = sys.stdin.readline() def checkPathwayType(filename): type='GeneSet' @@ -1982,7 +1984,7 @@ def visualizePathways(species_code,oraDirTogeneDir,combined_results): except Exception: pass wp_end_time = time.time(); time_diff = int(wp_end_time-wp_start_time) - print "Wikipathways output in %d seconds" % time_diff + print "Gene set results output in %d seconds" % time_diff def makeVariableList(wp_to_visualize,species_code,mod,imageType): variable_list=[] diff --git a/LineageProfilerIterate.py b/LineageProfilerIterate.py index ed7498d..3c14b2c 100755 --- a/LineageProfilerIterate.py +++ b/LineageProfilerIterate.py @@ -2645,6 +2645,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl output_dir = root_dir+'/exp.'+string.replace(output_dir,'exp.','') output_dir =string.replace(output_dir,'-OutliersRemoved','') groups_dir = string.replace(output_dir,'exp.','groups.') + ref_exp_db,ref_headers,ref_col_clusters,cluster_format_reference = importExpressionFile(reference_exp_file,customLabels=customLabels) cluster_results = clustering.remoteImportData(query_exp_file,geneFilter=ref_exp_db) if len(cluster_results[0])>0: filterIDs = ref_exp_db @@ -3587,6 +3588,7 @@ def importExpressionFile(input_file,ignoreClusters=False,filterIDs=False,customL gene_to_symbol={} symbol_to_gene={} row_count=0 + for line in open(input_file,'rU').xreadlines(): data = line.rstrip() data = string.replace(data,'"','') @@ -4290,6 +4292,7 @@ def convertICGSClustersToExpression(heatmap_file,query_exp_file,returnCentroids= matrix_exp, column_header_exp, row_header_exp, dataset_name, group_db_exp = clustering.importData(expdir,geneFilter=row_header) percent_found = (len(row_header_exp)*1.00)/len(row_header) + if percent_found<0.5: print "...Incompatible primary ID (Symbol), converting to Ensembl" import gene_associations; from import_scripts import OBO_import diff --git a/UI.py b/UI.py index 8310eb5..01b41ab 100755 --- a/UI.py +++ b/UI.py @@ -1090,6 +1090,9 @@ def runLineageProfiler(fl, expr_input_dir, vendor, custom_markerFinder, geneMode except: cellLabels = False if cellLabels == '': cellLabels = None + if '.h5' in expr_input_dir or '.mtx' in expr_input_dir: + from import_scripts import ChromiumProcessing + expr_input_dir = ChromiumProcessing.import10XSparseMatrix(expr_input_dir,species,'cellHarmony-Query') try: LineageProfilerIterate.runLineageProfiler(species,platform,expr_input_dir,expr_input_dir, codingtype,compendium_platform,customMarkers=custom_markerFinder, geneModels=geneModel,modelSize=modelSize,fl=fl,label_file=cellLabels) @@ -3302,11 +3305,13 @@ def getOnlineEliteDatabase(file_location_defaults,db_version,new_species_codes,u dbs_added = 0 AltAnalyze_folders = read_directory(''); Cytoscape_found = 'no' + """ for dir in AltAnalyze_folders: if 'Cytoscape_' in dir: Cytoscape_found='yes' if Cytoscape_found == 'no': fln,status = update.download(goelite_url+'Cytoscape/cytoscape.tar.gz','','') if 'Internet' not in status: print "Cytoscape program folder downloaded." + """ count = verifyFileLength('AltDatabase/TreeView/TreeView.jar') if count==0: @@ -5415,6 +5420,12 @@ def rebootAltAnalyzeGUI(selected_parameters,user_variables): input_file2 = gu.Results()['input_file2'] input_file3 = gu.Results()['input_file3'] input_file4 = gu.Results()['input_file4'] + input_file1 = gu.Results()['input_file5'] + input_file2 = gu.Results()['input_file6'] + input_file3 = gu.Results()['input_file7'] + input_file4 = gu.Results()['input_file8'] + input_file3 = gu.Results()['input_file9'] + input_file4 = gu.Results()['input_file10'] join_option = gu.Results()['join_option'] ID_option = gu.Results()['ID_option'] output_merge_dir = gu.Results()['output_merge_dir'] @@ -5426,6 +5437,12 @@ def rebootAltAnalyzeGUI(selected_parameters,user_variables): files_to_merge = [input_file1, input_file2] if len(input_file3)>0: files_to_merge.append(input_file3) if len(input_file4)>0: files_to_merge.append(input_file4) + if len(input_file5)>0: files_to_merge.append(input_file5) + if len(input_file6)>0: files_to_merge.append(input_file6) + if len(input_file7)>0: files_to_merge.append(input_file7) + if len(input_file8)>0: files_to_merge.append(input_file8) + if len(input_file7)>0: files_to_merge.append(input_file9) + if len(input_file8)>0: files_to_merge.append(input_file10) values = files_to_merge, join_option, ID_option, output_merge_dir StatusWindow(values,analysis) ### display an window with download status AltAnalyze.AltAnalyzeSetup((selected_parameters[:-1],user_variables)); sys.exit() diff --git a/import_scripts/mergeFiles.py b/import_scripts/mergeFiles.py index 5b88f01..92988b0 100755 --- a/import_scripts/mergeFiles.py +++ b/import_scripts/mergeFiles.py @@ -55,9 +55,143 @@ def cleanUpLine(line): data = string.replace(data,'"','') return data +def latteralMerge(files_to_merge,original_filename): + """ Merging files can be dangerous, if there are duplicate IDs (e.g., gene symbols). + To overcome issues in redundant gene IDs that are improperly matched (one row with zeros + and the other with values), this function determines if a lateral merge is more appropriate. + The latter merge: + 1) Checks to see if the IDs are the same with the same order between the two or more datasets + 2) merges the two or more matrices without looking at the genes. + + Note: This function is attempts to be memory efficient and should be updated in the future to + merge blocks of row IDs sequentially.""" + + files_to_merge_revised = [] + for filename in files_to_merge: + ### If a sparse matrix - rename and convert to flat file + if '.h5' in filename or '.mtx' in filename: + from import_scripts import ChromiumProcessing + import export + file = export.findFilename(filename) + export_name = file[:-4]+'-filt' + if file == 'filtered_feature_bc_matrix.h5' or file == 'raw_feature_bc_matrix.h5' or 'filtered_gene_bc_matrix.h5' or file == 'raw_gene_bc_matrix.h5': + export_name = export.findParentDir(filename) + export_name = export.findFilename(export_name[:-1]) + if file == 'matrix.mtx.gz' or file == 'matrix.mtx': + parent = export.findParentDir(filename) + export_name = export.findParentDir(parent) + export_name = export.findFilename(export_name[:-1]) + filename = ChromiumProcessing.import10XSparseMatrix(filename,'species',export_name) + files_to_merge_revised.append(filename) + files_to_merge = files_to_merge_revised + print files_to_merge + + includeFilenames = True + file_uids = {} + for filename in files_to_merge: + firstRow=True + fn=filepath(filename); x=0 + if '/' in filename: + file = string.split(filename,'/')[-1][:-4] + else: + file = string.split(filename,'\\')[-1][:-4] + for line in open(fn,'rU').xreadlines(): + data = cleanUpLine(line) + if '\t' in data: + t = string.split(data,'\t') + elif ',' in data: + t = string.split(data,',') + else: + t = string.split(data,'\t') + if firstRow: + firstRow = False + else: + uid = t[0] + try: + file_uids[file].append(uid) + except: + file_uids[file] = [uid] + + perfectMatch = True + for file1 in file_uids: + uids1 = file_uids[file1] + for file2 in file_uids: + uids2 = file_uids[file2] + if uids1 != uids2: + print file1,file2 + perfectMatch = False + + if perfectMatch: + print 'All ordered IDs match in the files ... performing latteral merge instead of key ID merge to prevent multi-matches...' + firstRow=True + increment = 5000 + low = 1 + high = 5000 + added = 1 + eo = open(output_dir+'/MergedFiles.txt','w') + import collections + + def exportMergedRows(low,high): + uid_values=collections.OrderedDict() + for filename in files_to_merge: + fn=filepath(filename); x=0; file_uids = {} + if '/' in filename: + file = string.split(filename,'/')[-1][:-4] + else: + file = string.split(filename,'\\')[-1][:-4] + firstRow=True + row_count = 0 + uids=[] ### Over-ride this for each file + for line in open(fn,'rU').xreadlines(): + row_count+=1 + if row_count<=high and row_count>=low: + data = cleanUpLine(line) + if '\t' in data: + t = string.split(data,'\t') + elif ',' in data: + t = string.split(data,',') + else: + t = string.split(data,'\t') + if firstRow and low==1: + file = string.replace(file,'_matrix_CPTT','') + if includeFilenames: + header = [s + "."+file for s in t[1:]] ### add filename suffix + else: + header = t[1:] + try: uid_values[row_count]+=header + except: uid_values[row_count]=header + uids.append('UID') + firstRow=False + else: + uid = t[0] + try: uid_values[row_count] += t[1:] + except: uid_values[row_count] = t[1:] + uids.append(uid) + i=0 + for index in uid_values: + uid = uids[i] + eo.write(string.join([uid]+uid_values[index],'\t')+'\n') + i+=1 + print 'completed',low,high + + uid_list = file_uids[file] + while (len(uid_list)+increment)>high: + exportMergedRows(low,high) + high+=increment + low+=increment + eo.close() + return True + else: + print 'Different identifier order in the input files encountered...' + return False + def combineAllLists(files_to_merge,original_filename,includeColumns=False): headers =[]; files=[] + run = latteralMerge(files_to_merge,original_filename) + if run: + return ### Exit Merge Function + import collections all_keys=collections.OrderedDict() dataset_data=collections.OrderedDict() diff --git a/import_scripts/multiBAMtoBED.py b/import_scripts/multiBAMtoBED.py index bc15184..900c1c4 100755 --- a/import_scripts/multiBAMtoBED.py +++ b/import_scripts/multiBAMtoBED.py @@ -186,10 +186,10 @@ def runBAMtoJunctionBED(paths_to_run): def runBAMtoExonBED(paths_to_run): bamfile_dir,refExonCoordinateFile,bed_reference_dir,output_bedfile_path = paths_to_run if os.path.exists(output_bedfile_path) == False: ### Only run if the file doesn't exist - BAMtoExonBED.parseExonReferences(bamfile_dir,bed_reference_dir,multi=True,intronRetentionOnly=False) + BAMtoExonBED.parseExonReferences(bamfile_dir,bed_reference_dir,multi=True,intronRetentionOnly=True) else: print output_bedfile_path, 'already exists... re-writing' - BAMtoExonBED.parseExonReferences(bamfile_dir,bed_reference_dir,multi=True,intronRetentionOnly=False) + BAMtoExonBED.parseExonReferences(bamfile_dir,bed_reference_dir,multi=True,intronRetentionOnly=True) def getChrFormat(directory): ### Determine if the chromosomes have 'chr' or nothing diff --git a/mappfinder.py b/mappfinder.py index e9ff5eb..7fc7741 100755 --- a/mappfinder.py +++ b/mappfinder.py @@ -955,7 +955,9 @@ def calculateZScores(pathway_input_gene_count,pathway_denominator_gene_count,N,R #if '06878' in pathway: print pathway, z, null_z, r,n, N, R;kill if use_FET == 'yes': ### Alternatively calculate p using the Fisher's Exact Test - p = FishersExactTest(r,n,R,N) + try: p = FishersExactTest(r,n,R,N) + except: + print r, n, R, N, pathway; forceIncompleteDenominator ### add missing genes to the Genes file!!! zsd.SetP(p) def Zscore(r,n,N,R): diff --git a/stats_scripts/ICGS_NMF.py b/stats_scripts/ICGS_NMF.py index 49465f3..3df921d 100644 --- a/stats_scripts/ICGS_NMF.py +++ b/stats_scripts/ICGS_NMF.py @@ -757,7 +757,7 @@ def DetermineClusterFitness(allgenesfile,markerfile,filterfile,BinarizedOutput,r upd_guides.append(i) countr+=1 counter+=1 - + print len(group), len(upd_guides),name return upd_guides,name,group def sortFile(allgenesfile,rho_cutoff,name): @@ -1034,7 +1034,7 @@ def CompleteICGSWorkflow(root_dir,processedInputExpFile,EventAnnot,iteration,rho Guidefile=graphic_links3[-1][-1] Guidefile=Guidefile[:-4]+'.txt' else: - Guidefile="/Users/saljh8/Dropbox/Collaborations/KPMP-Eric/3e284c30-51a6-480f-a248-0497b9863486_expression_matrix/ICGS/Clustering-exp.3e284c30-PageRank-downsampled-OutliersRemoved-Guide3-hierarchical_cosine_correlation.txt" + Guidefile="/Users/saljh8/Downloads/Clustering-exp.type2b_cell_to_cluster-VarGenes-Guide3-hierarchical_cosine_correlation.txt" rho_cutoff=0.2 try: @@ -1491,7 +1491,7 @@ def runICGS_NMF(inputExpFile,scaling,platform,species,gsp,enrichmentInput='',dyn else: processedInputExpFile = inputExpFile else: ### Re-run using a prior produced ICGS2 result - processedInputExpFile = '/Users/saljh8/Dropbox/Collaborations/KPMP-Eric/3e284c30-51a6-480f-a248-0497b9863486_expression_matrix/ExpressionInput/exp.3e284c30-PageRank-downsampled.txt' + processedInputExpFile = '/Users/saljh8/Downloads/Type2b/ExpressionInput/exp.type2b_cell_to_cluster-VarGenes.txt' flag=True iteration=1 ### Always equal to 1 for scRNA-Seq but can increment for splice-ICGS diff --git a/stats_scripts/statistics.py b/stats_scripts/statistics.py index a5aeaa3..b368e14 100755 --- a/stats_scripts/statistics.py +++ b/stats_scripts/statistics.py @@ -73,6 +73,45 @@ def LinearRegression(ls1,ls2,return_rsqrd): else: return slope +def BenjaminiHochberg(filename): + #1. Sort ascending the original input p value vector. Call this spval. Keep the original indecies so you can sort back. + #2. Define a new vector called tmp. tmp= spval. tmp will contain the BH p values. + #3. m is the length of tmp (also spval) + #4. i=m-1 + #5 tmp[ i ]=min(tmp[i+1], min((m/i)*spval[ i ],1)) - second to last, last, last/second to last + #6. i=m-2 + #7 tmp[ i ]=min(tmp[i+1], min((m/i)*spval[ i ],1)) + #8 repeat step 7 for m-3, m-4,... until i=1 + #9. sort tmp back to the original order of the input p values. + + pval_db = {} + firstRow=True + for line in open(filename,'rU').xreadlines(): + if firstRow: + firstRow=False + else: + data = line.rstrip() + uid,p = string.split(data,'\t') + pval_db[uid] = float(p) + + global spval; spval=[] + for element in pval_db: + p = pval_db[element] + spval.append([p,element]) + + spval.sort(); tmp = spval; m = len(spval); i=m-2; x=0 ###Step 1-4 + #spval.sort(); tmp = spval; m = len(spval)-1; i=m-1; x=0 ###Step 1-4 + + while i > -1: + tmp[i]=min(tmp[i+1][0], min((float(m)/(i+1))*spval[i][0],1)),tmp[i][1]; i -= 1 + + eo = open(filename[:-4]+'_BH.txt','w') + eo.write('UID\trawp\tBH-p\n') + for (adjp,element) in tmp: + p = pval_db[element] + eo.write(element+'\t'+str(p)+'\t'+str(adjp)+'\n') + eo.close() + def adjustPermuteStats(pval_db): #1. Sort ascending the original input p value vector. Call this spval. Keep the original indecies so you can sort back. #2. Define a new vector called tmp. tmp= spval. tmp will contain the BH p values. @@ -1155,7 +1194,8 @@ def returnANOVAFiltered(filename,original_data,matrix_pvalues): filename = '/Users/saljh8/Desktop/top_alt_junctions-clust-Grimes_relativePE.txt' filename = '/Volumes/SEQ-DATA/Jared/AltResults/AlternativeOutput/Hs_RNASeq_top_alt_junctions-PSI-clust.txt' filename = '/Users/saljh8/Desktop/dataAnalysis/Mm_Simulation_AltAnalyze/AltResults/AlternativeOutput/Mm_RNASeq_top_alt_junctions-PSI-clust.txt' - #filename = '/Volumes/salomonis2/Grimes/tophat SKI KO/bams/AltResults/AlternativeOutput/Mm_RNASeq_top_alt_junctions-PSI-clust.txt' + filename = '/Users/saljh8/Downloads/LRT.txt' + BenjaminiHochberg(filename);sys.exit() matrix,compared_groups,original_data = matrixImport(filename) matrix_pvalues=runANOVA(filename,matrix,compared_groups) returnANOVAFiltered(filename,original_data,matrix_pvalues); sys.exit() diff --git a/visualization_scripts/clustering.py b/visualization_scripts/clustering.py index cac432e..3e37173 100644 --- a/visualization_scripts/clustering.py +++ b/visualization_scripts/clustering.py @@ -3393,7 +3393,7 @@ def get_cmap(n, name='hsv'): color = 'r'; label=None try: ax.plot(scores[pcA][i],scores[1][i],color=color,marker='o',markersize=marker_size,label=label,markeredgewidth=0,picker=True) except Exception, e: print e; print i, len(scores[pcB]);kill - if showLabels: + if showLabels and len(column_header)<100: try: sample_name = ' '+string.split(sample_name,':')[1] except Exception: pass ax.text(scores[pcA][i],scores[pcB][i],sample_name,fontsize=11) @@ -3786,7 +3786,7 @@ def PCA3D(matrix, column_header, row_header, dataset_name, group_db, else: if numberGenesPresent==2: cm = matplotlib.colors.ListedColormap(['#00FF00', '#1E90FF']) - cm = matplotlib.colors.ListedColormap(['w', 'k']) + #cm = matplotlib.colors.ListedColormap(['w', 'k']) elif numberGenesPresent==3: cm = matplotlib.colors.ListedColormap(['#88BF47', '#3D3181', '#EE2C3C']) elif numberGenesPresent==4: @@ -3899,7 +3899,7 @@ def PCA3D(matrix, column_header, row_header, dataset_name, group_db, color = 'r'; label=None ax.plot([scores[0][i]],[scores[1][i]],[scores[2][i]],color=color,marker='o',markersize=markersize,label=label,markeredgewidth=0,picker=True) #markeredgecolor=color - if showLabels: + if showLabels and len(column_header)<100: #try: sample_name = ' '+string.split(sample_name,':')[1] #except Exception: pass ax.text(scores[0][i],scores[1][i],scores[2][i], ' '+sample_name,fontsize=9) @@ -5322,11 +5322,12 @@ def buildGraphFromSIF(mod,species,sif_filename,ora_input_dir): output_filename = iGraphSimple(sif_filename,fold_db,pathway_name) except Exception: print 'igraph export failed (not installed - or too large of a network)... Trying NetworkX.' - #print traceback.format_exc() - try: output_filename = displaySimpleNetworkX(sif_filename,original_fold_db,pathway_name) - except Exception: + if 'Elite' not in sif_filename: #print traceback.format_exc() - pass + try: output_filename = displaySimpleNetworkX(sif_filename,original_fold_db,pathway_name) + except Exception: + #print traceback.format_exc() + pass return output_filename def iGraphSimple(sif_filename,fold_db,pathway_name): @@ -7828,6 +7829,25 @@ def convertSymbolLog(input_file,ensembl_symbol,species=None,logNormalize=True): eo.close() +def Log2Only(input_file): + eo = export.ExportFile(input_file[:-4]+'-log2.txt') + header=0 + for line in open(input_file,'rU').xreadlines(): + data = cleanUpLine(line) + values = string.split(data,'\t') + gene = values[0] + if header == 0: + data = cleanUpLine(line) + headers = [] + values = string.split(data,'\t') + eo.write(string.join(headers,'\t')+'\n') + header+=1 + else: + values = map(lambda x: math.log(float(x)+1,2),values[1:]) + values = map(lambda x: str(x)[:5],values) + eo.write(string.join([gene]+values,'\t')+'\n') + eo.close() + def convertXenaBrowserIsoformDataToStandardRatios(input_file): eo = open(input_file[:-4]+'-log2.txt','w') header=0 @@ -8452,7 +8472,7 @@ def importCellHarmonyDEGs(folder, repair=False): fold = -1 / math.pow(2, float(LogFold)) if Symbol == 'S100a8': print 'S100a8', file, LogFold, fold - if abs(fold) > 1.0 and rawp < 0.05: + if abs(fold) > 0.0 and rawp < 0.05: try: DEG_db[Symbol].append([file, direction]) except: @@ -8467,20 +8487,20 @@ def importCellHarmonyDEGs(folder, repair=False): return DEG_db ref_DEGs = importCellHarmonyDEGs(reference_fold_dir) - repaired_DEGs = importCellHarmonyDEGs(repair_dir1, repair=True) - repaired2_DEGs = importCellHarmonyDEGs(repair_dir2, repair=True) + repaired_DEGs = importCellHarmonyDEGs(repair_dir1) + repaired2_DEGs = importCellHarmonyDEGs(repair_dir2) total_repaired_genes ={} for gene in ref_DEGs: if gene in repaired_DEGs: for (file1,direction1) in ref_DEGs[gene]: for (file2,direction2) in repaired_DEGs[gene]: - if direction1 == direction2: + if direction1 == direction2 and file1 == file2: try: total_repaired_genes[gene].append([direction1,file1,file2]) except: total_repaired_genes[gene] = [[direction1,file1,file2]] for gene in total_repaired_genes: print gene+'\t'+str(total_repaired_genes[gene]) - print len(total_repaired_genes);sys.exit() + sys.exit() def importCellHarmonyPseudoBulkFolds(filename): fold_db = {} header = True @@ -8506,14 +8526,18 @@ def importCellHarmonyPseudoBulkFolds(filename): repair2_verified = collections.OrderedDict() cluster_ordered_ref_db = collections.OrderedDict() header = True + eo1 = export.ExportFile(organized_diff_ref[:-4] + '-Filtered.txt') for line in open(organized_diff_ref, 'rU').xreadlines(): data = cleanUpLine(line) t = string.split(data, '\t') if header: + eo1.write(line) ref_header = t header = False else: cluster, geneID = string.split(t[0], ':') + if geneID in total_repaired_genes: + eo1.write(line) cluster = string.split(cluster, '_')[0] if cluster[:2] == 'DM': cluster = 'global' @@ -8522,7 +8546,8 @@ def importCellHarmonyPseudoBulkFolds(filename): cluster_ordered_ref_db[cluster].append(geneID) except: cluster_ordered_ref_db[cluster] = [geneID] - + eo1.close() + sys.exit() repaired_verified = {} verified = {} for geneID, ref_cluster in ordered_ref_degs: @@ -9136,8 +9161,9 @@ def TFisoToGene(filename,marker_genes): if __name__ == '__main__': b = '/Volumes/salomonis2/Immune-10x-data-Human-Atlas/Bone-Marrow/Stuart/Browser/ExpressionInput/HS-compatible_symbols.txt' b = '/data/salomonis2/GSE107727_RAW-10X-Mm/filtered-counts/ExpressionInput/Mm_compatible_symbols.txt' - input_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-10x/Scadden-90k/ICGS-cH/exp.MF-cellHarmony.txt' - #convertSymbolLog(input_file,b,species='Mm',logNormalize=False); sys.exit() + input_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/LungMAP/Perl/exp.ExplantBronchiolitisObliterans.txt' + #Log2Only(input_file);sys.exit() + #convertSymbolLog(input_file,b,species='Hs',logNormalize=True); sys.exit() marker_genes = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Isoform-U01/6k-Genecode30/GTEx/Gene-level/ExpressionInput/Genes-MarkerFinder.txt' #TFisoToGene('/Users/saljh8/Downloads/clones.txt',marker_genes);sys.exit() @@ -9194,6 +9220,14 @@ def TFisoToGene(filename,marker_genes): reference_fold_dir = '/Users/saljh8/Dropbox/Collaborations/Jayati/Thpok/Merged-GFP-WT-RR44/cellHarmony_KO-vs-WT/DifferentialExpression_Fold_1.2_adjp_0.05' repair_dir1 = '/Users/saljh8/Dropbox/Collaborations/Jayati/Thpok/Fluidigm/GeneExpression/cellHarmony_ThPOK-vs-WT-Nature2020-Fluidigm-rawp/DifferentialExpression_Fold_1.2_rawp_0.05' repair_dir2 = '/Users/saljh8/Dropbox/Collaborations/Jayati/Thpok/Fluidigm/GeneExpression/cellHarmony_ThPOK-vs-WT-Nature2020-Fluidigm-rawp/DifferentialExpression_Fold_1.2_rawp_0.05' + + organized_diff_ref = '/Users/saljh8/Dropbox/Collaborations/Huppert/Joint-ICGS2-JAG-WTB/UnsupervisedAnalysis/ICGS-NMF_euclidean_cc/cellHarmony-exp2/OrganizedDifferentials.txt' + repair1_folds = '/Users/saljh8/Dropbox/Collaborations/Huppert/Joint-ICGS2-JAG-WTB/UnsupervisedAnalysis/ICGS-NMF_euclidean_cc/cellHarmony-exp1/OtherFiles/exp.WTB__JB-AllCells-folds.txt' + repair2_folds = '/Users/saljh8/Dropbox/Collaborations/Huppert/Joint-ICGS2-JAG-WTB/UnsupervisedAnalysis/ICGS-NMF_euclidean_cc/cellHarmony-exp1/OtherFiles/exp.WTB__JB-AllCells-folds.txt' + reference_fold_dir = '/Users/saljh8/Dropbox/Collaborations/Huppert/Joint-ICGS2-JAG-WTB/UnsupervisedAnalysis/ICGS-NMF_euclidean_cc/cellHarmony-exp2/DifferentialExpression_Fold_1.2_adjp_0.05' + repair_dir1 = '/Users/saljh8/Dropbox/Collaborations/Huppert/Joint-ICGS2-JAG-WTB/UnsupervisedAnalysis/ICGS-NMF_euclidean_cc/cellHarmony-exp1/DifferentialExpression_Fold_1.2_adjp_0.05' + repair_dir2 = '/Users/saljh8/Dropbox/Collaborations/Huppert/Joint-ICGS2-JAG-WTB/UnsupervisedAnalysis/ICGS-NMF_euclidean_cc/cellHarmony-exp1/DifferentialExpression_Fold_1.2_adjp_0.05' + print 'comparing cellHarmony outputs' #rankExpressionRescueFromCellHarmony(organized_diff_ref, repair1_folds, repair2_folds, reference_fold_dir, repair_dir1, repair_dir2);sys.exit() @@ -9229,10 +9263,10 @@ def TFisoToGene(filename,marker_genes): #PSIfilterAndImpute('/Volumes/salomonis2/LabFiles/krithika_circadian/GSE98965-Papio_Anubis/files/grp-files/Filtered-Psi-groups-files'); sys.exit() filename='/Users/saljh8/Desktop/DemoData/Venetoclax/D4/cellHarmony-rawp-stringent/gene_summary.txt' filename = '/Volumes/salomonis2/LabFiles/Nathan/10x-PBMC-CD34+/AML-p27-pre-post/pre/cellHarmony-latest/gene_summary-p27.txt' - filename = '/Users/saljh8/Dropbox/Collaborations/Jayati/Thpok/SoupX/cellHarmony/gene_summary2.txt' + filename = '/Users/saljh8/Dropbox/Collaborations/Huppert/Joint-ICGS2-JAG-WTB/ICGS-NMF_euclidean_cc/cellHarmony-exp2/cell-frequency-stats-avg.txt' index1=2;index2=3; x_axis='Number of Differentially Expressed Genes'; y_axis = 'Comparisons'; title='Hippocampus - Number of Differentially Expressed Genes' - #OutputFile = export.findParentDir(filename) - #OutputFile = export.findParentDir(OutputFile[:-1])+'/test.pdf' + OutputFile = export.findParentDir(filename) + OutputFile = export.findParentDir(OutputFile[:-1])+'/test.pdf' #exportTFcorrelations('/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/SuperPan/ExpressionInput/exp.Cdt1-2139-genes.txt','/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Marie.Dominique/TF-to-gene/228-tfs.txt',0.1);sys.exit() #stackedbarchart(filename,display=True,output=OutputFile);sys.exit() index1=2;index2=3; x_axis='Number of DEGs'; y_axis = 'Reference clusters'; title='cellHarmony Differentially Expressed Genes' @@ -9249,7 +9283,7 @@ def TFisoToGene(filename,marker_genes): ##transposeMatrix(a);sys.exit() #returnIntronJunctionRatio('/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Fluidigm_scRNA-Seq/12.09.2107/counts.WT-R412X.txt');sys.exit() #geneExpressionSummary('/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/combined-ICGS-Final/ExpressionInput/DEGs-LogFold_1.0_rawp');sys.exit() - b = '/Users/saljh8/Dropbox/Collaborations/Isoform-U01/GTEX-30-sample/TCGA-BRCA/forICGS/ICGS-NMF-cosine/groups.PAM50-all.txt' + b = '/Volumes/salomonis2/LabFiles/Kairavee/PseudoBulk_SJIA_ICGS_0.2_Pearson_final/ICGS-NMF/FinalGroups-1-celltypes.txt' a = '/Users/saljh8/Dropbox/scRNA-Seq Markers/Human/Expression/Lung/Adult/Perl-CCHMC/FinalMarkerHeatmap_all.txt' convertGroupsToBinaryMatrix(b,b,cellHarmony=False);sys.exit() a = '/Users/saljh8/Desktop/temp/groups.TNBC.txt' @@ -9346,8 +9380,8 @@ def TFisoToGene(filename,marker_genes): gene_list_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/combined-ICGS-Final/ExpressionInput/genes.txt' gene_list_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/combined-ICGS-Final/R412X/genes.txt' gene_list_file = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/HCA/BM1-8_CD34+/ExpressionInput/MixedLinPrimingGenes.txt' - gene_list_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-10x/Klein-Camargo/TCF15/Exp1/Active-Inactive/ExpressionInput/genes.txt' - genesets = importGeneList(gene_list_file,n=25) + gene_list_file = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Naren/Combined/ICGS-NMF/cellHarmony-non-immune-10-cells/genes.txt' + genesets = importGeneList(gene_list_file,n=29) filename = '/Users/saljh8/Desktop/Grimes/KashishNormalization/3-25-2015/comb-plots/exp.IG2_GG1-extended-output.txt' filename = '/Users/saljh8/Desktop/Grimes/KashishNormalization/3-25-2015/comb-plots/genes.tpm_tracking-ordered.txt' filename = '/Users/saljh8/Desktop/demo/Amit/ExpressedCells/GO-Elite_results/3k_selected_LineageGenes-CombPlotInput2.txt' @@ -9363,7 +9397,7 @@ def TFisoToGene(filename,marker_genes): filename = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/10X-DropSeq-comparison/DropSeq/MultiLinDetect/ExpressionInput/DataPlots/exp.DropSeq-2k-log2.txt' filename = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/combined-ICGS-Final/R412X/exp.allcells-v2.txt' filename = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/HCA/BM1-8_CD34+/ExpressionInput/exp.CD34+.v5-log2.txt' - filename = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-10x/Klein-Camargo/TCF15/Exp1/Meg-Multi/ExpressionInput/exp.TCF15_S1_Raw.txt' + filename = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Naren/Combined/ICGS-NMF/cellHarmony-non-immune-10-cells/exp.MET__METdF-AllCells-dF.txt' #filename = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-10x/CITE-Seq-MF-indexed/ExpressionInput/exp.cellHarmony.v3.txt' #filename = '/Volumes/salomonis2/Theodosia-Kalfa/Combined-10X-CPTT/ExpressionInput/exp.MergedFiles-ICGS.txt' #filename = '/Users/saljh8/Desktop/dataAnalysis/Collaborative/Grimes/All-Fluidigm/updated.8.29.17/Ly6g/combined-ICGS-Final/R412X/exp.cellHarmony-WT-R412X-relative.txt' @@ -9373,7 +9407,7 @@ def TFisoToGene(filename,marker_genes): print genesets for gene_list in genesets: - multipleSubPlots(filename,gene_list,SubPlotType='column',n=25) + multipleSubPlots(filename,gene_list,SubPlotType='column',n=29) sys.exit() plotHistogram(filename);sys.exit()