- Fixed recently introduced cellHarmony Louvain errors.
- Fixed an averaging error in sampleIndexSelection with missing values.
- Added increased flexibility to the preAligned labels format.
nsalomonis committed Jun 13, 2020
1 parent ef39e2d commit 1112f32
Showing 16 changed files with 314 additions and 56 deletions.
15 changes: 14 additions & 1 deletion AltAnalyze.py
@@ -5114,7 +5114,7 @@ def AltAnalyzeSetup(skip_intro):
if 'remoteViewer' == skip_intro:
if os.name == 'nt':
callWXPython()
elif os.name == 'ntX':
elif os.name == 'nt':
package_path = filepath('python')
win_package_path = string.replace(package_path,'python','AltAnalyzeViewer.exe')
import subprocess
@@ -8597,7 +8597,20 @@ def unpackConfigFiles():
are written to the user home directory in the folder 'altanalyze'."""

fn = filepath('Config/options.txt') ### See if a Config folder is already available
print fn
fileExists = os.path.isfile(fn)
print 'Options.txt found in local Config = ',fileExists,

try:
for line in open(fn,'r').readlines():
break
print '...confirmed found'
except:
print '...confirmed not found'

if 'AltAnalyze.app' in os.getcwd(): ### Overcomes potential problems with the above
fileExists = True

if fileExists == False:
import subprocess
import shutil
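The unpackConfigFiles() hunk above no longer trusts os.path.isfile alone: it also tries to read a line from Config/options.txt to confirm the file is actually openable, and forces the flag on when running from inside AltAnalyze.app. A minimal sketch of an exists-and-readable probe in that spirit (the helper name is an assumption, not code from the commit):

import os

def config_readable(fn):
    """Return True when the options file both exists and can be opened/read."""
    if not os.path.isfile(fn):
        return False
    try:
        open(fn, 'r').readline()  # confirm the file is actually readable
        return True
    except Exception:
        return False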
Binary file modified AltDatabase/kallisto/0.43.1-splice/Mac/bin/kallisto
Binary file not shown.
2 changes: 1 addition & 1 deletion Config/defaults-expr.txt
@@ -1 +1 @@
array_type dabg_p rpkm_threshold gene_exp_threshold exon_exp_threshold exon_rpkm_threshold expression_threshold perform_alt_analysis analyze_as_groups expression_data_format normalize_feature_exp normalize_gene_data avg_all_for_ss include_raw_data probability_algorithm FDR_statistic batch_effects marker_finder visualize_results run_lineage_profiler run_goeliteexon 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediatelyAltMouse 0.75 NA NA NA NA 1 yes NA log NA NA NA yes moderated t-test Benjamini-Hochberg no yes yes yes run immediatelygene 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately3'array NA NA NA NA NA NA NA NA log NA None NA no moderated t-test Benjamini-Hochberg no yes yes yes run immediatelyjunction 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediatelyRNASeq NA 1 200 5 0.5 5 yes yes non-log RPKM NA known exons no moderated t-test Benjamini-Hochberg no yes yes yes run immediately
garray_type dabg_p rpkm_threshold gene_exp_threshold exon_exp_threshold exon_rpkm_threshold expression_threshold perform_alt_analysis analyze_as_groups expression_data_format normalize_feature_exp normalize_gene_data avg_all_for_ss include_raw_data probability_algorithm FDR_statistic batch_effects marker_finder visualize_results run_lineage_profiler run_goeliteexon 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediatelyAltMouse 0.75 NA NA NA NA 1 yes NA log NA NA NA yes moderated t-test Benjamini-Hochberg no yes yes yes run immediatelygene 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediately3'array NA NA NA NA NA NA NA NA log NA None NA no moderated t-test Benjamini-Hochberg no yes yes yes run immediatelyjunction 0.05 NA NA NA NA 1 yes yes log NA NA constitutive probesets yes moderated t-test Benjamini-Hochberg no yes yes yes run immediatelyRNASeq NA 1 200 5 0.5 5 yes yes non-log RPKM NA known exons no moderated t-test Benjamini-Hochberg no yes yes yes run immediately
37 changes: 33 additions & 4 deletions ExpressionBuilder.py
@@ -185,6 +185,14 @@ def calculate_expression_measures(expr_input_dir,expr_group_dir,experiment_name,
### differentiate data from column headers
if x == 1:
fold_data = fold_data[1:]; fold_data2=[]
"""
if len(array_names) != len(fold_data):
diff = len(fold_data)-len(fold_data)
fold_data+=diff*['']
if arrayid == 'FOXN4|1/2|10F06_ENSP00000299162':
print fold_data;sys.exit()
"""

for fold in fold_data:
fold = string.replace(fold,'"','')
try:
@@ -3133,19 +3141,40 @@ def filterByLocalJunctionExp(gene,features):

if prior_gene == '!ENSG00000198001': ### For testing
novel_junc_count = 0
all_junc_count = 0
for junc in feature_exp_db:
if "_" in junc: novel_junc_count+=1
if novel_junc_count>5000:
all_junc_count+=1
if novel_junc_count>1000:
### Indicates genomic variation resulting in broad diversity
### Will prevent function from running in a reasonable amount of time
#print "skipping"
pass
else:
start_time = time.time()
#print novel_junc_count, all_junc_count, prior_gene,
filterByLocalJunctionExp(prior_gene,feature_exp_db)
end_time = time.time(); time_diff = int(end_time-start_time)
#print time_diff
#try: gene_junction_denom[prior_gene] = [max(value) for value in zip(*gene_junction_denom[prior_gene])] # sum the junction counts for all junctions across the gene
#except Exception: pass
if platform == 'RNASeq':

filterByLocalJunctionExp(prior_gene,feature_exp_db)
novel_junc_count = 0
all_junc_count = 0
for junc in feature_exp_db:
if "_" in junc: novel_junc_count+=1
all_junc_count+=1
if novel_junc_count>1000:
### Indicates genomic variation resulting in broad diversity
### Will prevent function from running in a reasonable amount of time
#print "skipping"
pass
else:
start_time = time.time()
#print novel_junc_count, all_junc_count, prior_gene,
filterByLocalJunctionExp(prior_gene,feature_exp_db)
end_time = time.time(); time_diff = int(end_time-start_time)
#print time_diff
else:
compareJunctionExpression(prior_gene)
feature_exp_db={}
@@ -3170,7 +3199,7 @@ def filterByLocalJunctionExp(gene,features):
except Exception: graphic_links=[]
"""
print len(exported)/2,'junctions exported' #,len(retained_introns)/2, 'retained introns exported...'
return
return []

def getGeneAnnotations(species):
gene_annotations={}
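In the filterByLocalJunctionExp() hunks above, the expensive per-gene comparison is now skipped when a gene carries more than 1,000 novel junctions (IDs containing an underscore), since that degree of genomic diversity would make the function impractically slow. A condensed sketch of that gate (the helper name is an assumption; the threshold and underscore convention come from the hunk):

NOVEL_JUNCTION_LIMIT = 1000  # threshold used in the patched code

def worth_filtering_locally(feature_exp_db):
    """Return True when a gene has few enough novel ('_'-containing) junction
    IDs for local junction filtering to finish in a reasonable time."""
    novel_junc_count = sum(1 for junc in feature_exp_db if '_' in junc)
    return novel_junc_count <= NOVEL_JUNCTION_LIMIT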
6 changes: 3 additions & 3 deletions RNASeq.py
@@ -5679,8 +5679,8 @@ def predictCellTypesFromClusters(icgs_groups_path, goelite_path):
column_method = 'hopach'
species = 'Hs'
excludeCellCycle = False
icgs_groups_path='/Volumes/salomonis2/CCHMC-Collaborations/Rafi-Kopan-10X-Rhesus/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-20190506-3v3rhe/10X-Kopan-Monkey-Kidney-Cortex-Nuclei/outs/soupX-without_GENEL-LIST-0.5/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-0.5_matrix_CPTT/ICGS-NMF_cosine_cc/FinalGroups.txt'
goelite_path='/Volumes/salomonis2/CCHMC-Collaborations/Rafi-Kopan-10X-Rhesus/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-20190506-3v3rhe/10X-Kopan-Monkey-Kidney-Cortex-Nuclei/outs/soupX-without_GENEL-LIST-0.5/10X-Kopan-Monkey-Kidney-Cortex-Nuclei-0.5_matrix_CPTT/ICGS-NMF_cosine_cc/GO-Elite/clustering/exp.FinalMarkerHeatmap_all/GO-Elite_results/pruned-results_z-score_elite.txt'
icgs_groups_path='/Users/saljh8/Downloads/Correlation_files_BRCA/ICGS-NMF/FinalGroups.txt'
goelite_path='/Users/saljh8/Downloads/Correlation_files_BRCA/ICGS-NMF/GO-Elite/clustering/exp.FinalMarkerHeatmap_all/GO-Elite_results/pruned-results_z-score_elite.txt'
predictCellTypesFromClusters(icgs_groups_path, goelite_path);sys.exit()
platform = 'RNASeq'; graphic_links=[('','/Volumes/HomeBackup/CCHMC/PBMC-10X/ExpressionInput/SamplePrediction/DataPlots/Clustering-33k_CPTT_matrix-CORRELATED-FEATURES-iterFilt-hierarchical_cosine_cosine.txt')]
"""
@@ -5690,7 +5690,7 @@ def predictCellTypesFromClusters(icgs_groups_path, goelite_path):
"""
import UI; import multiprocessing as mlp

#runKallisto('Mm','BoneMarrow','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/altanalyze/Mm-FASTQ','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/altanalyze/Mm-FASTQ',mlp);sys.exit()
runKallisto('Mm','ALP-ILC','/Volumes/salomonis2/PublicDatasets/GSE113765-ILC-Mm/bulk-RNASeq/','/Volumes/salomonis2/PublicDatasets/GSE113765-ILC-Mm/bulk-RNASeq/',mlp);sys.exit()
runKallisto('Hs','BreastCancer','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/BreastCancerDemo/FASTQs/input','/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/BreastCancerDemo/FASTQs/input',mlp);sys.exit()

results_file = '/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/l/July-2017/PSI/test/Clustering-exp.round2-Guide3-hierarchical_cosine_correlation.txt'
2 changes: 1 addition & 1 deletion UI.py
@@ -2539,7 +2539,7 @@ def checkbuttoncallback(tag,state,checkbuttoncallback=self.checkbuttoncallback,o
quit_win.pack(side = 'right', padx =10, pady = 5)

button_text = 'Help'
url = 'http://www.altanalyze.org/help_main.htm'; self.url = url
url = 'https://altanalyze.readthedocs.io/en/latest/'; self.url = url
pdf_help_file = 'Documentation/AltAnalyze-Manual.pdf'; pdf_help_file = filepath(pdf_help_file); self.pdf_help_file = pdf_help_file

try: help_button = Button(self._parent, text=button_text, command=self.GetHelpTopLevel); help_button.pack(side = 'left', padx = 5, pady = 5)
3 changes: 2 additions & 1 deletion build_scripts/setup_binary.py
@@ -8,7 +8,7 @@

_script = 'AltAnalyze.py'
_appName = "AltAnalyze"
_appVersion = '2.1.3'
_appVersion = '2.1.4.1'
_appDescription = "AltAnalyze is a freely available, open-source and cross-platform program that allows you to processes raw bulk or single-cell RNASeq and "
_appDescription +="microarray data, identify predicted alternative splicing or alternative promoter changes and "
_appDescription +="view how these changes may affect protein sequence, domain composition, and microRNA targeting."
@@ -68,6 +68,7 @@
options = {"py2app":
{"excludes": excludes,
"includes": includes,
'plist': 'Info.plist',
#"frameworks": frameworks,
#"resources": resources,
#"argv_emulation": True,
14 changes: 10 additions & 4 deletions import_scripts/ChromiumProcessing.py
@@ -13,7 +13,7 @@
except:
print ('Missing the h5py library (hdf5 support)...')

def import10XSparseMatrix(matrices_dir,genome,dataset_name, expFile=None, log=True):
def import10XSparseMatrix(matrices_dir,genome,dataset_name, expFile=None, log=True, geneIDs=False):
start_time = time.time()

if '.h5' in matrices_dir:
@@ -50,9 +50,11 @@ def import10XSparseMatrix(matrices_dir,genome,dataset_name, expFile=None, log=Tr
barcodes = [row[0] for row in csv.reader(gzip.open(barcodes_path), delimiter="\t")]
else:
gene_ids = [row[0] for row in csv.reader(open(genes_path), delimiter="\t")]
print gene_ids[0:10]
gene_names = [row[1] for row in csv.reader(open(genes_path), delimiter="\t")]
barcodes = [row[0] for row in csv.reader(open(barcodes_path), delimiter="\t")]

if geneIDs:
gene_names = gene_ids
#barcodes = map(lambda x: string.replace(x,'-1',''), barcodes) ### could possibly cause issues with comparative analyses
matrices_dir = os.path.abspath(os.path.join(matrices_dir, os.pardir))

@@ -128,15 +130,19 @@ def calculateCPTT(val,barcode_sum):
filter_file=None
genome = 'hg19'
dataset_name = '10X_filtered'
geneID = False
if len(sys.argv[1:])<=1: ### Indicates that there are insufficient number of command-line arguments
print "Insufficient options provided";sys.exit()
#Filtering samples in a datasets
#python 10XProcessing.py --i /Users/test/10X/outs/filtered_gene_bc_matrices/ --g hg19 --n My10XExperiment
else:
options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','g=','n='])
options, remainder = getopt.getopt(sys.argv[1:],'', ['i=','g=','n=','geneID='])
#print sys.argv[1:]
for opt, arg in options:
if opt == '--i': matrices_dir=arg
elif opt == '--g': genome=arg
elif opt == '--n': dataset_name=arg
import10XSparseMatrix(matrices_dir,genome,dataset_name)
elif opt == '--geneID':
geneID = True

import10XSparseMatrix(matrices_dir,genome,dataset_name,geneIDs = geneID)
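With the new --geneID option above, the identifiers from the first column of the genes file (typically Ensembl gene IDs) are used in place of gene symbols when writing the expression file. Because the option is registered with getopt as 'geneID=', it still expects a value on the command line even though any value simply switches the flag on. A hedged usage example, following the existing usage comment in the script (paths and dataset name are placeholders):

python import_scripts/ChromiumProcessing.py --i /Users/test/10X/outs/filtered_gene_bc_matrices/ --g hg19 --n My10XExperiment --geneID yes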
14 changes: 12 additions & 2 deletions import_scripts/mergeFiles.py
@@ -72,7 +72,12 @@ def combineAllLists(files_to_merge,original_filename,includeColumns=False):
file = string.split(filename,'\\')[-1][:-4]
for line in open(fn,'rU').xreadlines():
data = cleanUpLine(line)
t = string.split(data,'\t')
if '\t' in data:
t = string.split(data,'\t')
elif ',' in data:
t = string.split(data,',')
else:
t = string.split(data,'\t')
if x==0:
if data[0]!='#':
x=1
@@ -201,7 +206,12 @@ def combineUniqueAllLists(files_to_merge,original_filename):
file = string.split(filename,'\\')[-1][:-4]
for line in open(fn,'rU').xreadlines():
data = cleanUpLine(line)
t = string.split(data,'\t')
if '\t' in data:
t = string.split(data,'\t')
elif ',' in data:
t = string.split(data,',')
else:
t = string.split(data,'\t')
if x==0:
if data[0]!='#':
x=1
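Both combineAllLists() and combineUniqueAllLists() gain the same tab-or-comma fallback above, so comma-separated inputs can be merged alongside tab-delimited ones. A compact sketch of that detection (the helper name is an assumption):

def split_fields(data):
    """Split a cleaned line on tab when present, otherwise on comma;
    lines with neither delimiter come back as a single field."""
    if '\t' in data:
        return data.split('\t')
    elif ',' in data:
        return data.split(',')
    return data.split('\t')  # no delimiter found: [data]

print(split_fields('gene\t1.2\t3.4'))  # ['gene', '1.2', '3.4']
print(split_fields('gene,1.2,3.4'))    # ['gene', '1.2', '3.4']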
13 changes: 10 additions & 3 deletions import_scripts/sampleIndexSelection.py
@@ -145,9 +145,16 @@ def filterFile(input_file,output_file,filter_names,force=False,calculateCentroid
means={}
for cluster in group_index_db:
#### group_index_db[cluster] is all of the indeces for samples in a noted group, cluster is the actual cluster name (not number)
try: mean=statistics.avg(map(lambda x: float(filtered_values[x]), group_index_db[cluster]))
except:
continue
raw_values = map(lambda x: filtered_values[x], group_index_db[cluster])
raw_values2=[]
for vx in raw_values:
if vx != '':
raw_values2.append(float(vx))

if len(raw_values2)>2:
mean=statistics.avg(raw_values2)
else:
mean = ""
#mean = map(lambda x: filtered_values[uid][x], group_index_db[cluster]) ### Only one value
means[cluster]=mean
mean_matrix.append(str(mean))
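The hunk above is the averaging fix named in the commit message: empty-string entries are dropped before the group centroid is computed, and a mean is only reported when more than two usable values remain. A standalone sketch of that behaviour (the helper name is an assumption, and plain arithmetic stands in for the package's statistics.avg):

def centroid_mean(values, min_usable=3):
    """Average a list of string values, ignoring missing ('') entries;
    return '' when fewer than min_usable numeric values are present."""
    usable = [float(v) for v in values if v != '']
    if len(usable) >= min_usable:
        return sum(usable) / len(usable)
    return ''

print(centroid_mean(['1.0', '', '3.0', '5.0']))  # 3.0
print(centroid_mean(['1.0', '', '3.0']))         # '' (too few usable values)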
3 changes: 2 additions & 1 deletion stats_scripts/cellHarmony.py
@@ -78,7 +78,7 @@ def manage_louvain_alignment(species,platform,query_exp_file,exp_output,
ref = reference
query = query_exp_file

louvain_results = cluster_corr.find_nearest_cells(ref,
louvain_results, ref_results = cluster_corr.find_nearest_cells(ref,
query,
gene_list=gene_list,
num_neighbors=10,
@@ -87,6 +87,7 @@
min_cluster_correlation=-1,
genome=species)
cluster_corr.write_results_to_file(louvain_results, output_classification_file, labels=customLabels)
cluster_corr.write_results_to_file(ref_results, output_classification_file[:-4]+'-reference.txt', labels=customLabels)

try:
LineageProfilerIterate.harmonizeClassifiedSamples(species, reference, query_exp_file, output_classification_file,fl=fl)
6 changes: 3 additions & 3 deletions stats_scripts/cluster_corr.py
@@ -111,16 +111,16 @@ def add_labels(barcode):
print("\t".join( ("Query Barcode", "Ref Barcode", "Correlation", "Query Partition", "Ref Partition") ), file=f)
for q in results.keys():
print("\t".join( (q,
results[q]['barcode'],
results[q]['barcode'].replace('.Reference',''),
str(results[q]['correlation']),
str(results[q]['query_partition']),
str(results[q]['ref_partition'])) ), file=f)
else:
with open(filename, 'w') as f:
print("\t".join( ("Query Barcode", "Ref Barcode", "Correlation", "Query Partition", "Ref Partition", "Label") ), file=f)
for q in results.keys():
print("\t".join( (q,
results[q]['barcode'],
print("\t".join( (q.replace('.Reference',''),
results[q]['barcode'].replace('.Reference',''),
str(results[q]['correlation']),
str(results[q]['query_partition']),
str(results[q]['ref_partition']),
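Together, the cellHarmony.py and cluster_corr.py changes above make up the Louvain fix from the commit message: find_nearest_cells now returns reference-cell assignments alongside the query results, the reference table is written next to the query classification file, and the '.Reference' suffix is stripped from barcodes before writing. A small illustration of the two bookkeeping details (paths and barcode are placeholders):

output_classification_file = '/path/to/CellClassification.txt'  # placeholder path
reference_output = output_classification_file[:-4] + '-reference.txt'
print(reference_output)  # /path/to/CellClassification-reference.txt

barcode = 'AAACCTGAGACAGACC-1.Reference'  # placeholder barcode
print(barcode.replace('.Reference', ''))  # AAACCTGAGACAGACC-1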
28 changes: 21 additions & 7 deletions stats_scripts/preAligned.py
@@ -98,7 +98,10 @@ def exportCellClassifications(output_file,query_cells,filtered_query_cells,repre
CI = query_cells[query_barcode]
cluster_number = CI.ClusterNumber()
label = CI.Label()
ref_barcode = representative_refcluster_cell[label][-1]
try:
ref_barcode = representative_refcluster_cell[label][-1]
except:
continue
values = [query_barcode,ref_barcode,'1.0',cluster_number,cluster_number,label]
o.write(string.join(values,'\t')+'\n')
o.close()
@@ -153,29 +156,40 @@ def importCelltoClusterAnnotations(filename):
t = string.split(data,',')
if firstRow:
ci = t.index('cell_id')
cn = t.index('cluster_number')
try: cn = t.index('cluster_number')
except: cn = 'False'
try: cm = t.index('cluster_name')
except: cm = False
except: cm = 'False'
try: cnm = t.index('ClustNameNum')
except: cnm = 'False'
try: cnm = t.index('label')
except: pass
dn = t.index('dataset_name')
dt = t.index('dataset_type')
firstRow = False
else:
cell_id = t[ci]
cluster_number = t[cn]
try: cluster_number = t[cn]
except: cluster_number = 'False'
dataset_name = t[dn]
dataset_type = t[dt]
if cm != False:
if cnm !='False':
label = t[cnm]
if cluster_number == 'False':
cluster_number = label
elif cm != False:
cluster_name = t[cm]
label = cluster_name + '_c'+cluster_number
elif cluster_number == False:
label = t[cm]
else:
label = 'c'+cluster_number

if string.lower(dataset_type)[0] == 'r':
dataset_type = 'Reference'
reference_dataset = dataset_name
CI = CellInfo(cell_id, cluster_number, dataset_name, dataset_type, label)
refererence_cells[cell_id]=CI
else:
elif string.lower(dataset_type)[0] == 'q':
dataset_type = 'Query'
query_dataset = dataset_name
CI = CellInfo(cell_id, cluster_number, dataset_name, dataset_type, label)
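The importCelltoClusterAnnotations() hunk above is the labels-format change from the commit message: the annotation file may now supply a combined 'ClustNameNum' or 'label' column, or the older 'cluster_name'/'cluster_number' pair, and rows are only kept when dataset_type starts with 'r' (reference) or 'q' (query). A simplified sketch of the label resolution, not the function as committed (the helper name and dict-based row are assumptions):

def resolve_label(row):
    """row maps column names to values for one line of the annotation file."""
    if 'label' in row or 'ClustNameNum' in row:        # new combined-label columns
        return row.get('label', row.get('ClustNameNum'))
    if 'cluster_name' in row and 'cluster_number' in row:
        return row['cluster_name'] + '_c' + row['cluster_number']
    if 'cluster_number' in row:                        # fall back to a bare cluster id
        return 'c' + row['cluster_number']
    return row.get('cluster_name', '')

print(resolve_label({'cluster_name': 'Tcell', 'cluster_number': '4'}))  # Tcell_c4
print(resolve_label({'label': 'Tcell_c4'}))                             # Tcell_c4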