Skip to content

Commit

Permalink
10/25/2020
Browse files Browse the repository at this point in the history
-Updated UniProt update process due to file format changes
-Updated NCBI eUTILS bug handling
-Added option for building of exon.bed in CLI/GUI
-Blocked networkX graph export when producing heatmaps (backend conflict)
  • Loading branch information
nsalomonis committed Oct 25, 2020
1 parent c1ca4a8 commit 5f0eb6c
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 35 deletions.
20 changes: 18 additions & 2 deletions AltAnalyze.py
Original file line number Diff line number Diff line change
Expand Up @@ -5529,7 +5529,11 @@ def AltAnalyzeMain(expr_var,alt_var,goelite_var,additional_var,exp_file_location
analysisType = ['exon','junction','reference']
#analysisType = ['junction']
#print [fl.multiThreading()]
multiBAMtoBED.parallelBAMProcessing(bam_dir,refExonCoordinateFile,outputExonCoordinateRefBEDfile,analysisType=analysisType,useMultiProcessing=fl.multiThreading(),MLP=mlp,root=root)
try: useExonReads = fl.UseExonReads(); print 'useExonReads',[useExonReads]; print 'multiThreading2',[fl.multiThreading()]
except: useExonReads = False
multiBAMtoBED.parallelBAMProcessing(bam_dir,refExonCoordinateFile,
outputExonCoordinateRefBEDfile,analysisType=analysisType,useMultiProcessing=fl.multiThreading(),
useExonReads=useExonReads,MLP=mlp,root=root)

biotypes = RNASeq.alignExonsAndJunctionsToEnsembl(species,exp_file_location_db,dataset,Multi=mlp)

Expand Down Expand Up @@ -6284,6 +6288,7 @@ def commandLineRun():
referenceFull=None
k=None
labels=None
useExonReads=False

original_arguments = sys.argv
arguments=[]
Expand Down Expand Up @@ -6351,7 +6356,8 @@ def commandLineRun():
'fold=','performDiffExp=','centerMethod=', 'k=','bamdir=',
'downsample=','query=','referenceFull=', 'maskGroups=',
'elite_dir=','numGenesExp=','numVarGenes=','accessoryAnalyses=',
'dataFormat=','geneTPM=','markerPearsonCutoff=', 'additionalAnalyses='])
'dataFormat=','geneTPM=','markerPearsonCutoff=', 'additionalAnalyses=',
'useExonReads='])
except Exception:
print traceback.format_exc()
print "There is an error in the supplied command-line arguments (each flag requires an argument)"; sys.exit()
Expand Down Expand Up @@ -6486,6 +6492,11 @@ def commandLineRun():
if multiThreading == 'yes': multiThreading = True
elif 'rue' in multiThreading: multiThreading = True
else: multiThreading = False
elif opt == '--useExonReads':
if string.lower(arg) == 'no' or string.lower(arg) == 'false':
useExonReads = False
else:
useExonReads = True

if perform_tests != False:
### Requires the mouse RNASeq database
Expand Down Expand Up @@ -6749,6 +6760,7 @@ def commandLineRun():
fl.setArrayType(array_type)
fl.setOutputDir(root_dir)
fl.setMultiThreading(multiThreading)
fl.setUseExonReads(useExonReads)
exp_file_location_db={}; exp_file_location_db[exp_name]=fl

### Assign variables needed to run Kallisto from FASTQ files
Expand All @@ -6768,6 +6780,7 @@ def commandLineRun():
#python AltAnalyze.py --runICGS yes --platform "RNASeq" --species Mm --column_method hopach --rho 0.4 --ExpressionCutoff 1 --FoldDiff 4 --SamplesDiffering 1 --excludeCellCycle strict --output /Users/saljh8/Desktop/Grimes/GEC14074 --expname test --bedDir /Users/saljh8/Desktop/Grimes/GEC14074 --multiProcessing no
fl.setCELFileDir(cel_file_dir)
fl.setMultiThreading(multiThreading)
fl.setUseExonReads(useExonReads)
fl.setExonBedBuildStatus('no')
fl.setFeatureNormalization('RPKM')
fl.setArrayType(array_type)
Expand Down Expand Up @@ -7934,6 +7947,7 @@ def commandLineRun():
fl.setArrayType(array_type)
fl.setOutputDir(root_dir)
fl.setMultiThreading(multiThreading)
fl.setUseExonReads(useExonReads)
exp_file_location_db={}; exp_file_location_db[exp_name]=fl

### Assign variables needed to run Kallisto from FASTQ files
Expand Down Expand Up @@ -8251,6 +8265,7 @@ def commandLineRun():
fl = UI.ExpressionFileLocationData('','','',''); fl.setExonBedBuildStatus('yes'); fl.setFeatureNormalization('none')
fl.setCELFileDir(cel_file_dir); fl.setArrayType(array_type); fl.setOutputDir(output_dir)
fl.setMultiThreading(multiThreading)
fl.setUseExonReads(useExonReads)
exp_file_location_db={}; exp_file_location_db[dataset_name]=fl; parent_dir = output_dir
perform_alt_analysis = 'expression'

Expand Down Expand Up @@ -8375,6 +8390,7 @@ def commandLineRun():
fl.setBatchEffectRemoval(batch_effects)
fl.setChannelToExtract(channel_to_extract)
fl.setMultiThreading(multiThreading)
fl.setUseExonReads(useExonReads)
try: fl.setExcludeLowExpressionExons(excludeNonExpExons)
except Exception: fl.setExcludeLowExpressionExons(True)
if 'other' in manufacturer or 'Other' in manufacturer:
Expand Down
5 changes: 3 additions & 2 deletions Config/options.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ dataset_name Give a name to this dataset enter InputCELFiles --- --- --- ---
input_cel_dir Select the CEL file containing folder folder InputCELFiles --- --- --- --- --- --- ---
input_fastq_dir (optional) Select fastq files to run in Kallisto folder InputCELFiles NA NA NA NA NA --- NA
output_CEL_dir Select an AltAnalyze result output directory folder InputCELFiles --- --- --- --- --- --- ---
multithreading Use multithreading for read genomic annotation comboBox InputCELFiles no NA NA NA NA NA yes|no NA
build_exon_bedfile Build exon coordinate bed file to obtain BAM file exon counts\k(see the online tutorial for additional details and information) single-checkbox InputCELFiles NA NA NA NA NA --- NA
multithreading Use multithreading for read genomic annotation comboBox InputCELFiles no NA NA NA NA NA no|yes NA
build_exon_bedfile Build exon coordinate bed file to obtain BAM file exon counts\k(see the online tutorial for additional details and information) single-checkbox InputCELFiles NA NA NA NA NA NA NA
useExonReads Export exon bed files (will slow down analyses) single-checkbox InputCELFiles NA NA NA NA NA --- NA
channel_to_extract Extract data from the following channels comboBox InputCELFiles NA NA NA green|red|green/red ratio|red/green ratio NA NA NA
remove_xhyb Remove probesets that have large cross-hybridization scores single-checkbox InputCELFiles --- --- NA NA NA NA NA
input_cdf_file Select the PGF library file for your array (required) file InputLibraryFiles note: the PGF file is apart of the standard library files for this array. This\kdirectory needs to also contain the CLF and BGP files for the array. These\kfiles can be downloaded from the Affymetrix website. --- --- --- --- --- --- ---
Expand Down
5 changes: 3 additions & 2 deletions LineageProfilerIterate.py
Original file line number Diff line number Diff line change
Expand Up @@ -2744,6 +2744,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl
query_header_proppegated_clusters={}
firstLine = True
exclude=[]
cell_count=0
for line in open(classification_file,'rU').xreadlines():
data = line.rstrip()
data = string.replace(data,'"','')
Expand All @@ -2760,7 +2761,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl
else:
sample = values[0]
score = float(values[score_index])

cell_count+=1
assigned_class = values[class_index]
if sample in original_sampleID_translation:
sample = original_sampleID_translation[sample]
Expand Down Expand Up @@ -2788,7 +2789,7 @@ def importAndCombineExpressionFiles(species,reference_exp_file,query_exp_file,cl
if score<pearsonThreshold: ### Minimum allowed correlation threshold
exclude.append(sample)

print len(exclude), 'cells excluded due to correlation below the indicated threshold'
print len(exclude), 'out of', cell_count, 'cells excluded due to correlation below the indicated threshold'
""" Assign a cluster label to the query sample if applicable """
query_clusters=[]
classified_samples={}
Expand Down
17 changes: 15 additions & 2 deletions UI.py
Original file line number Diff line number Diff line change
Expand Up @@ -4423,9 +4423,13 @@ def set_reference_exp_file(self,exp_file): self._reference_exp_file = exp_file
def setClassificationAnalysis(self, classificationAnalysis):
    """Store the supplied value on this object as `classificationAnalysis`."""
    self.classificationAnalysis = classificationAnalysis
def setReturnCentroids(self, returnCentroids):
    """Store the supplied value on this object as `returnCentroids`."""
    self.returnCentroids = returnCentroids
def setMultiThreading(self, multithreading):
    """Store the supplied multithreading setting on this object as `multithreading`."""
    self.multithreading = multithreading
def setUseExonReads(self, useExonReads):
    """Store the supplied value on this object as `useExonReads`.

    The stored value is what UseExonReads() later returns.
    """
    self.useExonReads = useExonReads
def setVendor(self, vendor):
    """Store the supplied value on this object as `vendor`."""
    self.vendor = vendor
def setKallistoFile(self, kallisto_exp):
    """Store the supplied value on this object as `kallisto_exp`.

    The stored value is what KallistoFile() later returns.
    """
    self.kallisto_exp = kallisto_exp
def KallistoFile(self):
    """Return this object's `kallisto_exp` attribute.

    No fallback is provided: an AttributeError propagates to the caller
    when the attribute is absent.
    """
    kallisto_exp = self.kallisto_exp
    return kallisto_exp
def UseExonReads(self):
    """Return this object's `useExonReads` attribute, or False when it is absent.

    Objects on which setUseExonReads() was never called have no such
    attribute, so the lookup falls back to False instead of raising.
    """
    # getattr with a default only absorbs the missing-attribute case; the
    # previous bare `except:` also swallowed unrelated errors and interrupts.
    return getattr(self, 'useExonReads', False)
def setPredictGroups(self, predictGroups):
    """Store the supplied value on this object as `predictGroups`."""
    self.predictGroups = predictGroups
def setPredictGroupsParams(self, predictGroupsObjects):
    """Store the supplied value on this object as `predictGroupsObjects`."""
    self.predictGroupsObjects = predictGroupsObjects
def setGraphicLinks(self, graphic_links):
    """Store the supplied value on this object as `graphic_links`."""
    ### file location of image files
    self.graphic_links = graphic_links
Expand Down Expand Up @@ -5870,11 +5874,18 @@ def rebootAltAnalyzeGUI(selected_parameters,user_variables):
dataset_name = gu.Results()['dataset_name']
try: remove_xhyb = gu.Results()['remove_xhyb']
except KeyError: remove_xhyb = 'no'
try:
useExonReads = gu.Results()['useExonReads']
if useExonReads == 'yes':
useExonReads = True
else:
useExonReads = False
except: useExonReads = False
try:
multiThreading = gu.Results()['multithreading']
if multiThreading == 'yes': multiThreading = True
else: multiThreading = False
except KeyError: multiThreading = True
except: multiThreading = True
try:
build_exon_bedfile = gu.Results()['build_exon_bedfile']
try: normalize_feature_exp = 'RPKM'
Expand Down Expand Up @@ -6628,7 +6639,7 @@ def importH5(h5_filename):
elif run_from_scratch == 'buildExonExportFiles':
fl = ExpressionFileLocationData('','','',''); fl.setExonBedBuildStatus('yes'); fl.setFeatureNormalization('none')
fl.setCELFileDir(cel_file_dir); fl.setArrayType(array_type); fl.setOutputDir(output_dir); fl.setMultiThreading(multiThreading)
exp_file_location_db={}; exp_file_location_db[dataset_name]=fl; parent_dir = output_dir
exp_file_location_db={}; exp_file_location_db[dataset_name]=fl; parent_dir = output_dir; fl.setUseExonReads(useExonReads)
perform_alt_analysis = 'expression'
elif groups_name in dir_files:
try:
Expand Down Expand Up @@ -6944,6 +6955,8 @@ def importH5(h5_filename):
try: fl.setPredictGroupsParams(gsp)
except Exception: pass
fl.setMultiThreading(multiThreading)
try: fl.setUseExonReads(useExonReads)
except: pass
if run_from_scratch == 'Process Expression file':
fl.setRootDir(output_dir) ### When the data is not primary array data files, allow for option selection of the output directory
fl.setOutputDir(output_dir)
Expand Down
51 changes: 39 additions & 12 deletions build_scripts/ExtractUniProtFunctAnnot.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import os.path
import unique
import copy
import traceback

def filepath(filename):
fn = unique.filepath(filename)
Expand Down Expand Up @@ -167,7 +168,9 @@ def getUniProtURLsForAllSupportedSpecies():
UI.exportDefaultFileLocations(file_location_defaults)

def import_uniprot_db(filename):

fn=filepath(filename); global species_not_imported; species_not_imported=[]
spacer = ' '
ac = '';sm='';id = '';sq = '';osd = ''; gn = '';dr = '';de = '';ft_string = ''; kw = ''; ft = []; ensembl = []; mgi = []; unigene = []; embl = []
ft_call=''; rc=''; go=''; x = 0; y = 0; count = 0
for line in open(fn,'r').xreadlines():
Expand All @@ -193,14 +196,37 @@ def import_uniprot_db(filename):
elif 'GN Name=' in data:
null,gn = string.split(data,'GN Name='); gn = gn[0:-1]
elif data[0:2] == 'FT':
try:
if len(ft_string) > 0 and data[5] == ' ': ft_string = ft_string + data[33:]
elif len(ft_string) > 0 and data[5] != ' ': #if previous loop added data but the next ft line is a new piece of functional data
ft.append(ft_string) #append the previous value
#"""
#if '/note=' in data or '/id' in data or '..' in data
data = string.replace(data,'"','')
if len(ft_string) > 0 and data[5] == ' ':
ft_val = data[21:]
if '/note=' in ft_val:
ft_val = string.replace(ft_val,'/note=','')
try: int(ft_val) ### will cause issues
except: ft_string += ' ' + ft_val
elif '..' in ft_val:
ft_val = string.replace(ft_val,'..',' ')
ft_string += ' ' + ft_val
else:
try:
site = str(int(data[21:])+1)
ft_string += ' ' + ft_val + ' '+ site ### single AA site increment second position by 1AA
except: pass
elif len(ft_string) > 0 and data[5] != ' ': #if previous loop added data but the next ft line is a new piece of functional data
ft.append(ft_string) #append the previous value
if '..' in data:
data = string.replace(data,'..',' ')
try:
site = str(int(data[21:])+1)
ft_string = data[5:]+' '+site ### single AA site increment second position by 1AA
except:
ft_string = data[5:]
else: ft_string = ft_string + data[5:]
except IndexError:
print ft_string;kill
else:
if '..' in data:
data = string.replace(data,'..',' ')
ft_string = ft_string + data[5:]

elif data[0:2] == 'CC': ###grab function description information
if '-!-' in data: x=0;y=0
if x == 1: ft_call = ft_call + data[8:]
Expand Down Expand Up @@ -258,12 +284,13 @@ def import_uniprot_db(filename):
except KeyError: uniprot_ensembl_db[secondary_ac]=[ens]

ensembl += alternate_ensembls

y = UniProtAnnotations(id,ac,sq,ft_list2,ensembl,gn,file_type,de,embl,unigene,mgi,ft_call,class_def,cellular_components)
uniprot_db[id] = y
else: species_not_imported.append(osd)
ac = '';id = '';sq = '';osd = '';gn = '';dr = '';de = ''; ft_call=''; rc='';sm='';go=''; kw=''
ft_string = '';ft = []; ensembl = []; mgi = []; unigene = []; embl = []

#print ft_list2;sys.exit()
x+=1
print "Number of imported swissprot entries:", len(uniprot_db)

Expand Down Expand Up @@ -349,7 +376,7 @@ def Sequence(self): return self._sequence
def Name(self): return self._name
def FTList(self):
new_FTList = [] ### Transform this set of feature information into objects
exlcusion_list = ['CHAIN','VARIANT','CONFLICT','VAR_SEQ']
exlcusion_list = ['CHAIN','VARIANT','CONFLICT','VAR_SEQ','MUTAGEN','INIT_MET']
for ft_entry in self._ft_list:
try:
if len(ft_entry)>3: feature, start, stop, description = ft_entry
Expand Down Expand Up @@ -426,7 +453,7 @@ def export():
if 'T0' not in ens_gene and 'P0' not in ens_gene: ### Exclude protein and transcript IDs
custom_annot=string.join([ens_gene,y.CellularComponent(), y.ClassDefinition(),gn,de,id,ac,unigene],'\t')+'\n'
if len(y.CellularComponent())>1 or len(y.ClassDefinition())>1: custom_annotations[ens_gene] = custom_annot
#print ft_list;sys.exit()
if len(ft_list)>0:
for dd in ft_list: ### Export domain annotations
try:
Expand Down Expand Up @@ -476,7 +503,7 @@ def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_
uniprot_ens_location_built = string.replace(uniprot_ens_location_built,'uniprot','Uniprot-SWISSPROT')
importEnsemblUniprot(uniprot_ens_location_built)
except Exception: null=[]

### Import UniProt annotations
counts = update.verifyFile(uniprot_location,'counts')
if force == 'no' or counts > 8: import_uniprot_db(uniprot_location)
Expand All @@ -488,7 +515,7 @@ def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_
try: os.remove(gz_filepath) ### Not sure why this works now and not before
except OSError: status = status
import_uniprot_db(uniprot_location)

if add_trembl_annotations == 'yes':
### Import TreMBL annotations
try:
Expand Down
Loading

0 comments on commit 5f0eb6c

Please sign in to comment.