10/25/2020

- Updated the UniProt update process due to file format changes
- Updated NCBI eUTILS bug handling
- Added an option for building exon.bed files in the CLI/GUI
- Blocked networkX graph export when producing heatmaps (backend conflict)
nsalomonis · Oct 25, 2020 · 5f0eb6c · 5f0eb6c
1 parent c1ca4a8
commit 5f0eb6c
Show file tree

Hide file tree

Showing 8 changed files with 116 additions and 35 deletions.
diff --git a/AltAnalyze.py b/AltAnalyze.py
@@ -5529,7 +5529,11 @@ def AltAnalyzeMain(expr_var,alt_var,goelite_var,additional_var,exp_file_location
             analysisType = ['exon','junction','reference']
             #analysisType = ['junction']
             #print [fl.multiThreading()]
-            multiBAMtoBED.parallelBAMProcessing(bam_dir,refExonCoordinateFile,outputExonCoordinateRefBEDfile,analysisType=analysisType,useMultiProcessing=fl.multiThreading(),MLP=mlp,root=root)
+            try: useExonReads = fl.UseExonReads(); print 'useExonReads',[useExonReads]; print 'multiThreading2',[fl.multiThreading()]
+            except: useExonReads = False
+            multiBAMtoBED.parallelBAMProcessing(bam_dir,refExonCoordinateFile,
+                    outputExonCoordinateRefBEDfile,analysisType=analysisType,useMultiProcessing=fl.multiThreading(),
+                    useExonReads=useExonReads,MLP=mlp,root=root)
 
           biotypes = RNASeq.alignExonsAndJunctionsToEnsembl(species,exp_file_location_db,dataset,Multi=mlp)
 
@@ -6284,6 +6288,7 @@ def commandLineRun():
     referenceFull=None
     k=None
     labels=None
+    useExonReads=False
 
     original_arguments = sys.argv
     arguments=[]
@@ -6351,7 +6356,8 @@ def commandLineRun():
                                                          'fold=','performDiffExp=','centerMethod=', 'k=','bamdir=',
                                                          'downsample=','query=','referenceFull=', 'maskGroups=',
                                                          'elite_dir=','numGenesExp=','numVarGenes=','accessoryAnalyses=',
-                                                         'dataFormat=','geneTPM=','markerPearsonCutoff=', 'additionalAnalyses='])
+                                                         'dataFormat=','geneTPM=','markerPearsonCutoff=', 'additionalAnalyses=',
+                                                         'useExonReads='])
     except Exception:
         print traceback.format_exc()
         print "There is an error in the supplied command-line arguments (each flag requires an argument)"; sys.exit()
@@ -6486,6 +6492,11 @@ def commandLineRun():
             if multiThreading == 'yes': multiThreading = True
             elif 'rue' in multiThreading: multiThreading = True
             else: multiThreading = False
+        elif opt == '--useExonReads':
+            if string.lower(arg) == 'no' or string.lower(arg) == 'false':
+                useExonReads = False
+            else:
+                useExonReads = True
 
     if perform_tests != False:
         ### Requires the mouse RNASeq database
@@ -6749,6 +6760,7 @@ def commandLineRun():
             fl.setArrayType(array_type)
             fl.setOutputDir(root_dir)
             fl.setMultiThreading(multiThreading)
+            fl.setUseExonReads(useExonReads)
             exp_file_location_db={}; exp_file_location_db[exp_name]=fl
 
             ### Assign variables needed to run Kallisto from FASTQ files
@@ -6768,6 +6780,7 @@ def commandLineRun():
                 #python AltAnalyze.py --runICGS yes --platform "RNASeq" --species Mm --column_method hopach --rho 0.4 --ExpressionCutoff 1 --FoldDiff 4 --SamplesDiffering 1 --excludeCellCycle strict --output  /Users/saljh8/Desktop/Grimes/GEC14074 --expname test --bedDir /Users/saljh8/Desktop/Grimes/GEC14074 --multiProcessing no
                 fl.setCELFileDir(cel_file_dir)
                 fl.setMultiThreading(multiThreading)
+                fl.setUseExonReads(useExonReads)
                 fl.setExonBedBuildStatus('no')
                 fl.setFeatureNormalization('RPKM')
                 fl.setArrayType(array_type)
@@ -7934,6 +7947,7 @@ def commandLineRun():
         fl.setArrayType(array_type)
         fl.setOutputDir(root_dir)
         fl.setMultiThreading(multiThreading)
+        fl.setUseExonReads(useExonReads)
         exp_file_location_db={}; exp_file_location_db[exp_name]=fl
 
         ### Assign variables needed to run Kallisto from FASTQ files
@@ -8251,6 +8265,7 @@ def commandLineRun():
             fl = UI.ExpressionFileLocationData('','','',''); fl.setExonBedBuildStatus('yes'); fl.setFeatureNormalization('none')
             fl.setCELFileDir(cel_file_dir); fl.setArrayType(array_type); fl.setOutputDir(output_dir)
             fl.setMultiThreading(multiThreading)
+            fl.setUseExonReads(useExonReads)
             exp_file_location_db={}; exp_file_location_db[dataset_name]=fl; parent_dir = output_dir
             perform_alt_analysis = 'expression'
 
@@ -8375,6 +8390,7 @@ def commandLineRun():
             fl.setBatchEffectRemoval(batch_effects)
             fl.setChannelToExtract(channel_to_extract)
             fl.setMultiThreading(multiThreading)
+            fl.setUseExonReads(useExonReads)
             try: fl.setExcludeLowExpressionExons(excludeNonExpExons)
             except Exception: fl.setExcludeLowExpressionExons(True)
             if 'other' in manufacturer or 'Other' in manufacturer:

diff --git a/Config/options.txt b/Config/options.txt
@@ -20,8 +20,9 @@ dataset_name	Give a name to this dataset	enter	InputCELFiles				---	---	---	---
 input_cel_dir	Select the CEL file containing folder	folder	InputCELFiles				---	---	---	---	---	---	---
 input_fastq_dir	(optional) Select fastq files to run in Kallisto	folder	InputCELFiles				NA	NA	NA	NA	NA	---	NA
 output_CEL_dir	Select an AltAnalyze result output directory	folder	InputCELFiles				---	---	---	---	---	---	---
-multithreading	Use multithreading for read genomic annotation	comboBox	InputCELFiles			no	NA	NA	NA	NA	NA	yes|no	NA
-build_exon_bedfile	Build exon coordinate bed file to obtain BAM file exon counts\k(see the online tutorial for additional details and information)	single-checkbox	InputCELFiles				NA	NA	NA	NA	NA	---	NA
+multithreading	Use multithreading for read genomic annotation	comboBox	InputCELFiles			no	NA	NA	NA	NA	NA	no|yes	NA
+build_exon_bedfile	Build exon coordinate bed file to obtain BAM file exon counts\k(see the online tutorial for additional details and information)	single-checkbox	InputCELFiles				NA	NA	NA	NA	NA	NA	NA
+useExonReads	Export exon bed files (will slow down analyses)	single-checkbox	InputCELFiles				NA	NA	NA	NA	NA	---	NA
 channel_to_extract	Extract data from the following channels	comboBox	InputCELFiles				NA	NA	NA	green|red|green/red ratio|red/green ratio	NA	NA	NA
 remove_xhyb	Remove probesets that have large cross-hybridization scores 	single-checkbox	InputCELFiles				---	---	NA	NA	NA	NA	NA
 input_cdf_file	Select the PGF library file for your array (required)	file	InputLibraryFiles	note: the PGF file is apart of the standard library files for this array. This\kdirectory needs to also contain the CLF and BGP files for the array. These\kfiles can be downloaded from the Affymetrix website.			---	---	---	---	---	---	---

diff --git a/LineageProfilerIterate.py b/LineageProfilerIterate.py
@@ -2744,6 +2744,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl
     query_header_proppegated_clusters={}
     firstLine = True
     exclude=[]
+    cell_count=0
     for line in open(classification_file,'rU').xreadlines():
         data = line.rstrip()
         data = string.replace(data,'"','')
@@ -2760,7 +2761,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl
         else:
             sample = values[0]
             score = float(values[score_index])
-
+            cell_count+=1
             assigned_class = values[class_index]
             if sample in original_sampleID_translation:
                 sample = original_sampleID_translation[sample]
@@ -2788,7 +2789,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl
             if score<pearsonThreshold: ### Minimum allowed correlation threshold
                 exclude.append(sample)
 
-    print len(exclude), 'cells excluded due to correlation below the indicated threshold'
+    print len(exclude), 'out of', cell_count, 'cells excluded due to correlation below the indicated threshold'
     """ Assign a cluster label to the query sample if applicable """
     query_clusters=[]
     classified_samples={}

diff --git a/UI.py b/UI.py
@@ -4423,9 +4423,13 @@ def set_reference_exp_file(self,exp_file): self._reference_exp_file = exp_file
     def setClassificationAnalysis(self, classificationAnalysis): self.classificationAnalysis = classificationAnalysis
     def setReturnCentroids(self,returnCentroids): self.returnCentroids = returnCentroids
     def setMultiThreading(self, multithreading): self.multithreading = multithreading
+    def setUseExonReads(self, useExonReads): self.useExonReads = useExonReads
     def setVendor(self,vendor): self.vendor = vendor
     def setKallistoFile(self,kallisto_exp): self.kallisto_exp = kallisto_exp
     def KallistoFile(self): return self.kallisto_exp
+    def UseExonReads(self):
+        try: return self.useExonReads
+        except: return False
     def setPredictGroups(self, predictGroups): self.predictGroups = predictGroups
     def setPredictGroupsParams(self, predictGroupsObjects): self.predictGroupsObjects = predictGroupsObjects
     def setGraphicLinks(self,graphic_links): self.graphic_links = graphic_links ### file location of image files
@@ -5870,11 +5874,18 @@ def rebootAltAnalyzeGUI(selected_parameters,user_variables):
                 dataset_name = gu.Results()['dataset_name']
                 try: remove_xhyb = gu.Results()['remove_xhyb']
                 except KeyError: remove_xhyb = 'no'
+                try:
+                    useExonReads = gu.Results()['useExonReads']
+                    if useExonReads == 'yes':
+                        useExonReads = True
+                    else:
+                        useExonReads = False
+                except: useExonReads = False
                 try:
                     multiThreading = gu.Results()['multithreading']
                     if multiThreading == 'yes': multiThreading = True
                     else: multiThreading = False
-                except KeyError: multiThreading = True
+                except: multiThreading = True
                 try:
                     build_exon_bedfile = gu.Results()['build_exon_bedfile']
                     try: normalize_feature_exp = 'RPKM'
@@ -6628,7 +6639,7 @@ def importH5(h5_filename):
         elif run_from_scratch == 'buildExonExportFiles':
                 fl = ExpressionFileLocationData('','','',''); fl.setExonBedBuildStatus('yes'); fl.setFeatureNormalization('none')
                 fl.setCELFileDir(cel_file_dir); fl.setArrayType(array_type); fl.setOutputDir(output_dir); fl.setMultiThreading(multiThreading)
-                exp_file_location_db={}; exp_file_location_db[dataset_name]=fl; parent_dir = output_dir
+                exp_file_location_db={}; exp_file_location_db[dataset_name]=fl; parent_dir = output_dir; fl.setUseExonReads(useExonReads)
                 perform_alt_analysis = 'expression'
         elif groups_name in dir_files:
             try:
@@ -6944,6 +6955,8 @@ def importH5(h5_filename):
         try: fl.setPredictGroupsParams(gsp)
         except Exception: pass
         fl.setMultiThreading(multiThreading)
+        try:  fl.setUseExonReads(useExonReads)
+        except: pass
         if run_from_scratch == 'Process Expression file':
             fl.setRootDir(output_dir) ### When the data is not primary array data files, allow for option selection of the output directory
             fl.setOutputDir(output_dir)

diff --git a/build_scripts/ExtractUniProtFunctAnnot.py b/build_scripts/ExtractUniProtFunctAnnot.py
@@ -21,6 +21,7 @@
 import os.path
 import unique
 import copy
+import traceback
 
 def filepath(filename):
     fn = unique.filepath(filename)
@@ -167,7 +168,9 @@ def getUniProtURLsForAllSupportedSpecies():
     UI.exportDefaultFileLocations(file_location_defaults)
 
 def import_uniprot_db(filename):
+
     fn=filepath(filename); global species_not_imported; species_not_imported=[]
+    spacer = '           '
     ac = '';sm='';id = '';sq = '';osd = ''; gn = '';dr = '';de = '';ft_string = ''; kw = ''; ft = []; ensembl = []; mgi = []; unigene = []; embl = []
     ft_call=''; rc=''; go=''; x = 0; y = 0; count = 0
     for line in open(fn,'r').xreadlines():
@@ -193,14 +196,37 @@ def import_uniprot_db(filename):
         elif 'GN   Name=' in data:
             null,gn = string.split(data,'GN   Name='); gn = gn[0:-1]
         elif data[0:2] == 'FT':
-            try:
-                if len(ft_string) > 0 and data[5] == ' ': ft_string = ft_string + data[33:]
-                elif len(ft_string) > 0 and data[5] != ' ': #if previous loop added data but the next ft line is a new piece of functional data
-                    ft.append(ft_string) #append the previous value
+            #"""
+            #if '/note=' in data or '/id' in data or '..' in data
+            data = string.replace(data,'"','')
+            if len(ft_string) > 0 and data[5] == ' ':
+                ft_val = data[21:]                
+                if '/note=' in ft_val:
+                    ft_val = string.replace(ft_val,'/note=','')
+                    try: int(ft_val) ### will cause issues
+                    except: ft_string += '  ' + ft_val
+                elif '..' in ft_val:
+                    ft_val = string.replace(ft_val,'..','  ')
+                    ft_string += '  ' + ft_val
+                else:
+                    try:
+                        site = str(int(data[21:])+1)
+                        ft_string += '  ' + ft_val + '  '+ site ### single AA site increment second position by 1AA
+                    except: pass
+            elif len(ft_string) > 0 and data[5] != ' ': #if previous loop added data but the next ft line is a new piece of functional data
+                ft.append(ft_string) #append the previous value
+                if '..' in data:
+                    data = string.replace(data,'..','  ')
+                try:
+                    site = str(int(data[21:])+1)
+                    ft_string = data[5:]+'  '+site ### single AA site increment second position by 1AA
+                except: 
                     ft_string = data[5:]
-                else: ft_string = ft_string + data[5:]
-            except IndexError:
-                print ft_string;kill
+            else:
+                if '..' in data:
+                    data = string.replace(data,'..','  ')
+                ft_string = ft_string + data[5:]
+
         elif data[0:2] == 'CC': ###grab function description information
             if '-!-' in data: x=0;y=0
             if x == 1: ft_call = ft_call + data[8:]
@@ -258,12 +284,13 @@ def import_uniprot_db(filename):
                     except KeyError: uniprot_ensembl_db[secondary_ac]=[ens]
 
               ensembl += alternate_ensembls
+
               y = UniProtAnnotations(id,ac,sq,ft_list2,ensembl,gn,file_type,de,embl,unigene,mgi,ft_call,class_def,cellular_components)
               uniprot_db[id] = y
             else: species_not_imported.append(osd)
             ac = '';id = '';sq = '';osd = '';gn = '';dr = '';de = ''; ft_call=''; rc='';sm='';go=''; kw=''
             ft_string = '';ft = []; ensembl = []; mgi = []; unigene = []; embl = []
-
+            #print ft_list2;sys.exit()
             x+=1
     print "Number of imported swissprot entries:", len(uniprot_db)
 
@@ -349,7 +376,7 @@ def Sequence(self): return self._sequence
     def Name(self): return self._name
     def FTList(self):
         new_FTList = [] ### Transform this set of feature information into objects
-        exlcusion_list = ['CHAIN','VARIANT','CONFLICT','VAR_SEQ']
+        exlcusion_list = ['CHAIN','VARIANT','CONFLICT','VAR_SEQ','MUTAGEN','INIT_MET']
         for ft_entry in self._ft_list:
             try:
                 if len(ft_entry)>3: feature, start, stop, description = ft_entry
@@ -426,7 +453,7 @@ def export():
             if 'T0' not in ens_gene and 'P0' not in ens_gene: ### Exclude protein and transcript IDs
                 custom_annot=string.join([ens_gene,y.CellularComponent(), y.ClassDefinition(),gn,de,id,ac,unigene],'\t')+'\n'
                 if len(y.CellularComponent())>1 or len(y.ClassDefinition())>1: custom_annotations[ens_gene] = custom_annot
-                                     
+        #print ft_list;sys.exit()  
         if len(ft_list)>0:
             for dd in ft_list:  ### Export domain annotations
                 try:
@@ -476,7 +503,7 @@ def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_
         uniprot_ens_location_built = string.replace(uniprot_ens_location_built,'uniprot','Uniprot-SWISSPROT')
         importEnsemblUniprot(uniprot_ens_location_built)
     except Exception: null=[]
-    
+
     ### Import UniProt annotations
     counts = update.verifyFile(uniprot_location,'counts')
     if force == 'no' or counts > 8: import_uniprot_db(uniprot_location)
@@ -488,7 +515,7 @@ def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_
             try: os.remove(gz_filepath) ### Not sure why this works now and not before
             except OSError: status = status     
         import_uniprot_db(uniprot_location)
-        
+
     if add_trembl_annotations == 'yes':
         ### Import TreMBL annotations
         try: