-
Notifications
You must be signed in to change notification settings - Fork 0
/
fasta-tryp.py
239 lines (183 loc) · 11.1 KB
/
fasta-tryp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
#! /usr/bin/python
## fasta-tryp.py
## Sandip Chatterjee
##
## Usage:
## >>python fasta-tryp.py proteome_fasta_file.fasta
##
## For generating a nonredundant peptide database in FASTA format from an input FASTA-formatted protein database
## 1) reads in all proteins from input file (standard FASTA format)
## 2) generates tryptic peptides from each protein based on PeptideCutter rules
## 3) generates nonredundant dictionary of tryptic peptides of length > 6
## 4) outputs a nonredundant peptide file for use in spectral matching searches
## 5) outputs an index file with parent proteins and organisms for each peptide
## (all output on one line; can be searched using grep '>peptide1234|' < *.fastaindex)
###### to do:
###### account for half-tryptic peptides?
import sys
import re
from Bio import SeqIO
if len(sys.argv) != 2:
print 'Need one argument -- a filename'
print 'Proper usage: >>fasta-tryp.py somefile.fasta'
sys.exit()
###################################################################################################
#global peptide dictionary
allPeptides = {}
###################################################################################################
## function trypsinDigest
## input: protein sequence (string, case-insensitive)
## output: tryptic peptides from input protein sequence (list of peptides, upper-case)
##
## Info from Expasy Peptide Cutter (http://web.expasy.org/peptide_cutter/peptidecutter_enzymes.html#Tryps):
## Preferentially cleaves at Arg and Lys in position P1 with higher rates for Arg (Keil, 1992), especially at high pH (but treated equally in the program).
##
## SPECIAL CASES FOR PROLINE AT P1':
## Pro usually blocks the action when found in position P1',
## but not when Lys is in position P1 and Trp is in position P2 at the same time.
## This blocking of cleavage exerted by Pro in position P1' is also negligible when Arg is in position P1 and Met is in position P2 at the same time
##
## SPECIAL CASES FOR LYSINE AT P1:
## Furthermore, if Lys is found in position P1 the following situation considerably block the action of trypsin:
## Either Asp in position P2 and Asp in position P1' or
## Cys in position P2 and Asp in position P1' or
## Cys in position P2 and His in position P1' or
## Cys in position P2 and Tyr in position P1'.
##
## SPECIAL CASES FOR ARGININE AT P1:
## A likewise considerable block of trypsin action is seen when Arg is in P1 and the following situations are found:
## Either Arg in position P2 and His in position P1' or
## Cys in position P2 and Lys in position P1' or
## Arg in position P2 and Arg in position P1'.
def trypsinDigest(protSeq):
trypticProtSeq = protSeq.upper().replace('K','K_').replace('R','R_') ## define all Lys, Arg potential cleavage sites as '_'
## special cases for trypsin cleavage with proline at P1'
trypticProtSeq = trypticProtSeq.replace('_P','P') ## if P1' residue (C-terminal to cleavage site) is Proline, remove cleavage site
trypticProtSeq = trypticProtSeq.replace('WKP','WK_P') ## if P1 is K and P2 is W, Proline at P1' will not block cleavage
trypticProtSeq = trypticProtSeq.replace('MRP','MR_P') ## if P1 is R and P2 is M, Proline at P1' will not block cleavage
## special cases for trypsin cleavage with lysine at P1
trypticProtSeq = trypticProtSeq.replace('DK_D','DKD') ## if P1 is K, P2 is D, and P1' is D, remove cleavage site
trypticProtSeq = trypticProtSeq.replace('CK_D','CKD') ## if P1 is K, P2 is C, and P1' is D, remove cleavage site
trypticProtSeq = trypticProtSeq.replace('CK_H','CKH') ## if P1 is K, P2 is C, and P1' is H, remove cleavage site
trypticProtSeq = trypticProtSeq.replace('CK_Y','CKY') ## if P1 is K, P2 is C, and P1' is Y, remove cleavage site
## special cases for trypsin cleavage with arginine at P1
trypticProtSeq = trypticProtSeq.replace('R_R_H','R_RH') ## if P1 is R, P2 is R, and P1' is H, remove cleavage site
## want to keep R_RH but not RR_H
trypticProtSeq = trypticProtSeq.replace('CR_K','CRK') ## if P1 is R, P2 is C, and P1' is K, remove cleavage site
trypticProtSeq = trypticProtSeq.replace('R_R_R','R_RR') ## if P1 is R, P2 is R, and P1' is R, remove cleavage site
## want to keep R_RR but not RR_R
trypticPeptides = trypticProtSeq.split('_') ## cleave at all sites in protSeq marked by '_'
# trypticPeptides = filter(lambda x: len(x)>6,trypticPeptides) ## only keep tryptic peptides with length > 6
trypticPeptides = [pep for pep in trypticPeptides if len(pep)>6] ## only keep tryptic peptides with length > 6
return trypticPeptides
###################################################################################################
## function parseFastaRecord(fastaRecord)
## input: a single FASTA record from a FASTA protein file (two-item list)
## output: a two-item list with the format [{'NCBIID':VALUE, 'proteinName':VALUE, 'organismID':VALUE, 'protSeq':VALUE, 'protPosition': -2}, [PEPTIDE1, PEPTIDE2, PEPTIDE3, ...]]
## output details: calls trypsinDigest() function using given protein sequence from fastaRecord
def parseFastaRecord(fastaRecord):
parsedFastaRecord = [{},[]]
infoLine = fastaRecord[0]
## parsing infoLine...
## ...for NCBI Protein ID
splitInfoLine1 = infoLine.split('|')
NCBIID = splitInfoLine1[3]
## ...for protein name
splitInfoLine1 = infoLine.split('|')
proteinName = splitInfoLine1[4].split('[')[0] ## remove Organism info (if present)
## ...for organism ID
## use re module below to find text within brackets []: re.findall(r"(?<=\[)[^\[]+(?=\])",infoLine)
## Obtaining protein sequence
protSeq = fastaRecord[1]
## construct dictionary
parsedFastaRecord = [{'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': ''.join(re.findall(r"(?<=\[)[^\[]+(?=\])",infoLine)), 'protSeq': protSeq, 'protPosition': -2}, trypsinDigest(protSeq)]
return parsedFastaRecord
###################################################################################################
## function registerPeptides
## Registers peptides from a single FASTA record in allPeptides dictionary
## input: FASTA record (modified; Python list)
## input list format: [{'NCBIID':VALUE, 'proteinName':VALUE, 'organismID':VALUE, 'protSeq':VALUE}, [PEPTIDE1, PEPTIDE2, PEPTIDE3, ...]]
## output: allPeptides dictionary with added peptide entries
## output list format: {'PEPTIDESEQUENCE':[{'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT}, ...]}
## output list format: Dictionary in which each Peptide sequence (key) is mapped to a list of length >= 1
## output list format: Each list item is a dictionary of information containing the source of the peptide
## output list format: One peptide may map to one or more proteins
############# protPosition doesn't work properly when there are two identical tryptic peptides in a protein sequence #############
############# but no way to distinguish these by mass spec anyway... #############
def registerPeptides(fastaRecord):
for peptide in fastaRecord[1]:
if peptide not in allPeptides:
allPeptides[peptide] = [] ## initialize list of protein records for new peptide
NCBIIDTemp = fastaRecord[0]['NCBIID']
proteinNameTemp = fastaRecord[0]['proteinName']
organismIDTemp = fastaRecord[0]['organismID']
protPositionTemp = fastaRecord[0]['protSeq'].find(peptide)
allPeptides[peptide].append({'NCBIID': NCBIIDTemp,'proteinName': proteinNameTemp, 'organismID': organismIDTemp,'protPosition': protPositionTemp})
return allPeptides
###################################################################################################
## function readFasta()
## Reads in all entries from a protein FASTA-format file using BioPython's SeqIO module (one entry at a time)
## Calls function parseFastaRecord() for each imported record
## Calls registerPeptides() for each parsedFastaRecord
## input: FASTA-formatted record file
## output: complete allPeptides dictionary
def readFasta():
for seq_record in SeqIO.parse(sys.argv[1],'fasta'):
allPeptides = registerPeptides(parseFastaRecord([seq_record.description,str(seq_record.seq)]))
return allPeptides
###################################################################################################
## function outputFasta(allPeptides)
## Outputs contents of allPeptides dictionary to FASTA formatted-file
## input: complete allPeptides dictionary
## output: FASTA file and index file
## format of FASTA file:
## >PEPTIDE1
## TRYPTICPEPTIDESEQUENCE1
## >PEPTIDE2
## TRYPTICPEPTIDESEQUENCE2
## ...
## format of index file:
## >PEPTIDE1
## [{'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT}]
## >PEPTIDE2
## [{'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT},
## {'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT}]
## >PEPTIDE3
## [{'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT}]
## >PEPTIDE4
## [{'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT},
## {'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT},
## {'NCBIID': NCBIID, 'proteinName': proteinName, 'organismID': organismID, 'protPosition': INT}]
## ...
def outputFasta(allPeptides):
## generate FASTA file
outputFastaFile = open(sys.argv[1].replace('.fasta','')+'_peptides_nonredundant'+'.fasta', 'w')
# peptideCount = 1
# for key in allPeptides:
# outputFastaFile.write('>peptide'+str(peptideCount)+'\n'+key+'\n')
# peptideCount += 1
for num,key in enumerate(allPeptides.keys()):
outputFastaFile.write('>peptide'+str(num+1)+'\n'+key+'\n')
print "Generated file "+sys.argv[1].replace('.fasta','')+'_peptides_nonredundant'+'.fasta'
outputFastaFile.close()
## generate index file
outputIndexFile = open(sys.argv[1].replace('.fasta','')+'_peptides_nonredundant'+'.fastaindex', 'w')
peptideCount = 1
for key in allPeptides:
outputIndexFile.write('>'+'peptide'+str(peptideCount)+'|') ## pipe character after peptide ID
peptideCount += 1
for item in range(len(allPeptides[key])):
if allPeptides[key][item]['organismID']:
# outputIndexFile.write(''.join([key,',',allPeptides[key][item]['NCBIID'],',',allPeptides[key][item]['proteinName'],',',allPeptides[key][item]['organismID'],',',str(allPeptides[key][item]['protPosition']),'|']))
outputIndexFile.write(key+','+allPeptides[key][item]['NCBIID']+','+allPeptides[key][item]['proteinName']+','+allPeptides[key][item]['organismID']+','+str(allPeptides[key][item]['protPosition'])+'|')
else:
# outputIndexFile.write(''.join([key,',',allPeptides[key][item]['NCBIID'],',',allPeptides[key][item]['proteinName'],',','',',',str(allPeptides[key][item]['protPosition']),'|']))
outputIndexFile.write(key+','+allPeptides[key][item]['NCBIID']+','+allPeptides[key][item]['proteinName']+','+''+','+str(allPeptides[key][item]['protPosition'])+'|')
outputIndexFile.write('\n')
print "Generated file "+sys.argv[1].replace('.fasta','')+'_peptides_nonredundant'+'.fastaindex'
outputIndexFile.close()
###################################################################################################
readFasta()
outputFasta(allPeptides)
print "Finished"
###################################################################################################