-
Notifications
You must be signed in to change notification settings - Fork 10
/
schema.py
469 lines (423 loc) · 17.7 KB
/
schema.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
#! /usr/local/bin/python
"""Module for SCHEMA tools.
This module provides core functions used by all the SCHEMA tools.
******************************************************************
Copyright (C) 2005 Allan Drummond, California Institute of Technology
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*******************************************************************
SCHEMA was developed in the laboratory of Frances H. Arnold at the California Institute of Technology.
References:
Voigt, C. et al., "Protein building blocks preserved by recombination," Nature Structural Biology 9(7):553-558 (2002).
Meyer, M. et al., "Library analysis of SCHEMA-guided recombination," Protein Science 12:1686-1693 (2003).
Otey, C. et al., "Functional evolution and structural conservation in chimeric cytochromes P450: Calibrating a structure-guided approach," Chemistry & Biology 11:1-20 (2004)
Silberg, J. et al., "SCHEMA-guided protein recombination," Methods in Enzymology 388:35-42 (2004).
Endelman, J. et al., "Site-directed protein recombination as a shortest-path problem," Protein Engineering, Design & Selection 17(7):589-594 (2005).
"""
import sys, string, random
import pdb
DIGITS_LETTERS = string.digits + string.letters
def alignPDBResidues(residues, aligned_parent_protein, aligned_pdb_protein, library_parent_protein, chain_identifiers):
"""Align the PDB residues to the aligned parent protein sequence, following the parent/PDB alignment."""
residues = [res for res in residues if res.chain in chain_identifiers]
new_residues = []
j = 0
# Prune/add to the residue list to make it match the library_parent_protein
# Assumptions:
# 1) The PDB sequence contains only amino acids, so that there are no explicit gaps.
# 2) The aligned_parent_protein and library_parent_protein differ only in the number of gaps.
# The PDB residues corresponding to sites in library_parent_protein will be accumulated
# in the new_residues list.
new_residues = []
# The process of assembling is somewhat involved. The indices i, j, and k
# indicate the current position in the library parent, parent/PDB alignment and PDB
# residue sequences, respectively. There are four cases:
# a) Library parent has a gap at the current position, and:
# 1) PDB-aligned parent also has a gap, but the parent-aligned PDB does not
# 2) Both have gaps
# b) Library parent has an amino acid at the current position, and:
# 3) PDB-aligned parent does not match the library parent
# 4) PDB-aligned parent matches
# 4.1) Parent-aligned PDB has a gap
# 4.2) Parent-aligned PDB has a residue
# The j index indicates positions in the aligned PDB protein, aligned_pdb_protein
# and aligned_parent_protein
j = 0
# The k index indicates positions in the PDB residue sequence, residues
k = 0
for i in range(len(library_parent_protein)):
li = library_parent_protein[i]
# If the parent has a gap,
if li == '-':
parj = aligned_parent_protein[j]
pdbj = aligned_pdb_protein[j]
if parj == '-' and pdbj != '-': # Parents have gaps, but PDB doesn't
new_residues.append(residues[k])
j += 1
k += 1
elif parj == pdbj == li: # All have gaps
new_residues.append(pdb.Residue(None))
j += 1
else: # parent does not contain a gap at i
# While the aligned parent doesn't match the library parent,
# advance the indices. This brings the sequences back into
# register.
pdbres = pdb.three_to_one_map[residues[k].residue]
#print li, aligned_parent_protein[j], aligned_pdb_protein[j], pdbres
while aligned_parent_protein[j] != li:
pdbres = pdb.three_to_one_map[residues[k].residue]
if aligned_pdb_protein[j] != '-':
k += 1
j += 1
#print "*", li, aligned_parent_protein[j], aligned_pdb_protein[j], pdbres
#print "end"
# Now, if the aligned PDB has a gap, insert a gap
# into the residues list, otherwise insert the residue.
if aligned_pdb_protein[j] == '-':
new_residues.append(pdb.Residue(None))
else:
pdbres = pdb.three_to_one_map[residues[k].residue]
#print i, j, k
#print li, aligned_parent_protein[j], aligned_pdb_protein[j], pdbres
#print library_parent_protein[:i+1]
#print aligned_parent_protein[:j+1]
#print aligned_pdb_protein[:j+1]
#print pdb.sequence(new_residues)
#print pdb.sequence(residues)[:k+1]
# Check to make sure we're adding the expected residue
if aligned_pdb_protein[j] != pdbres:
err_string = "Expected residue %s at aligned position %d based on alignment, but PDB had %s at corresponding residue %d. Aborting..."
raise ValueError, err_string % (aligned_pdb_protein[j], j+1, pdbres, k+1)
new_residues.append(residues[k])
k += 1
j += 1
if j >= len(aligned_pdb_protein) or k >= len(residues):
break
return new_residues
def getSCHEMAContactsWithCrossovers(contacts, parents, crossovers):
"""Get contacts with correction for parental sequence identity
and for fragments."""
fragments = getFragments(crossovers, parents[0])
filtered_contacts = []
# First eliminate contacts between residues in the same fragment. These
# can never be broken by recombination.
for (i, j, ri, rj) in contacts:
same_fragment = False
for (k,l) in fragments:
if i>=k and i<l and j>=k and j<l:
same_fragment = True
# Now eliminate contacts where either residue is absolutely conserved.
# These can never be broken by recombination.
if not same_fragment:
pi = [x[i] for x in parents]
pj = [x[j] for x in parents]
if not (pi.count(pi[0])==len(pi) or pj.count(pj[0])==len(pj)): # if neither i or j absolutely conserved
# This is a bona-fide breakable contact.
filtered_contacts.append((i,j,ri,rj))
return filtered_contacts
def getSCHEMAContacts(contacts, parents):
"""Get contacts with correction for parental sequence identity."""
filtered_contacts = []
# First eliminate contacts between residues in the same fragment. These
# can never be broken by recombination.
for (i, j, ri, rj) in contacts:
# Eliminate contacts where either residue is absolutely conserved.
# These can never be broken by recombination.
pi = [x[i] for x in parents]
pj = [x[j] for x in parents]
# if neither i or j absolutely conserved
if not (pi.count(pi[0])==len(pi) or pj.count(pj[0])==len(pj)):
# This is a bona-fide breakable contact.
filtered_contacts.append((i,j,ri,rj))
return filtered_contacts
def getPDBContacts(residues, contact_distance):
# Get set of residues within contact_distance angstroms in a PDB
contacts = []
# Loop through all residue pairs
for i in range(0,len(residues)-1):
for j in range(i+1, len(residues)):
resi = residues[i]
resj = residues[j]
# If both residues are present (gaps == None)
if resi and resj:
# If residues are within contact_distance angstroms of each other, add as a contact
contact = resi.isContact(resj, contact_distance)
if contact:
contacts.append((i, j, resi, resj))
return contacts
def writeContactFile(contacts, outfile):
outfile.write("# Fields are number, contacting residues i & j (in library coordinates), and residues i & j (in PDB coordinates)\n")
outfile.write("# n i j pdbi pdbj\n")
for k in range(len(contacts)):
(i, j, resi, resj) = contacts[k]
outfile.write("%d\t%d\t%d\t%s\t%s\n" % (k, i, j, resi.res_seq, resj.res_seq))
def readContactFile(f):
contacts = []
for line in f.readlines():
if line[0] == '#': # comment line
continue
flds = line.strip().split()
contacts.append((int(flds[1]), int(flds[2]), flds[3], flds[4]))
return contacts
def checkChimera(chimera_blocks, fragments, parents):
"""Checks a chimera to see if it's valid."""
# First see if there are the proper number of blocks
if len(chimera_blocks) != len(fragments):
return False
# Then ensure that, within each block, the parent index
# is no smaller than 1 or larger than the number of parents.
for i in range(len(chimera_blocks)):
if not (0 < int(chimera_blocks[i]) <= len(parents)):
return False
return True
def base(number, radix):
"""Returns a string representation of the number in the base indicated by radix."""
# Inverse function to int(str,radix) and long(str,radix)
if not 2 <= radix <= 36:
raise ValueError, "radix must be in 2..36"
result = []
addon = result.append
if number < 0:
number = -number
addon('-')
elif number == 0:
addon('0')
_divmod, _abc = divmod, DIGITS_LETTERS
while number:
number, rdigit = _divmod(number, radix)
addon(_abc[rdigit])
result.reverse()
return ''.join(result)
def indexToFragment(index, fragments):
"""Given an index into a full protein sequence, returns the zero-based index of the
fragment containing that position."""
for frag_index in range(len(fragments)):
(i,j) = fragments[frag_index]
if index >= i and index < j:
return frag_index
return None
def getChimeraDisruption(chimera_blocks, contacts, fragments, parents):
"""Takes a chimera block pattern, such as '11213312', and computes the SCHEMA
disruption, the number of contacts broken by recombination."""
parent_indices = [int(c)-1 for c in chimera_blocks]
num_disruptions = 0
for (i,j,ri,rj) in contacts:
frag_i = indexToFragment(i, fragments)
frag_j = indexToFragment(j, fragments)
if parent_indices[frag_i] == parent_indices[frag_j]:
# No disruption possible if both fragments come from the same parent.
continue
pair = (parents[parent_indices[frag_i]][i], parents[parent_indices[frag_j]][j])
# If pair doesn't exist in any parent, it's counted as disruptive
if pair not in [(p[i], p[j]) for p in parents]:
#print chimera_blocks, i+1, j+1
num_disruptions += 1
return num_disruptions
def getChimeraSequence(chimera_blocks, fragments, parents):
"""Converts a chimera block pattern, such as '11213312', into a protein sequence
by assembling fragments from the parents. This sequence may then be used
to compute mutational distances and so on."""
parent_indices = [int(c)-1 for c in chimera_blocks]
chimera = ''
for i in range(len(parent_indices)):
which_parent = parent_indices[i]
(begin, end) = fragments[i]
chimera += parents[which_parent][begin:end]
return chimera
def getChimeraDistance(chimera, parent):
"""Returns the number of mutations necessary to turn the chimera into the parent."""
return len([(caa, paa) for (caa,paa) in zip(chimera, parent) if (caa != paa)])
def getChimeraShortestDistance(chimera_blocks, fragments, parents):
"""Returns the minimum number of mutations necessary to turn the chimera into
at least one of the parent sequences."""
chimera = getChimeraSequence(chimera_blocks, fragments, parents)
ms = [getChimeraDistance(chimera, p) for p in parents]
return min(ms)
def readMultipleSequenceAlignmentFile(f):
"""Reads a multiple sequence alignment file in ALN format."""
parent_dict = {}
keys = []
for line in f.readlines():
if line[0] == '#': # comment line
continue
flds = line.strip().split()
# Skip lines with too little data to be sequence info
if len(flds) < 2:
continue
# Skip fields with incomprehensible data, such as alignment-score lines.
if flds[0][0] not in DIGITS_LETTERS:
continue
key = flds[0]
seq = flds[1]
# Extract the parent sequence
if parent_dict.has_key(key):
parent_dict[key] += seq
else:
keys.append(key)
parent_dict[key] = seq
# Join the sequences and replace dots with dashes to indicate gaps.
parent_list = [(key,''.join(parent_dict[key]).replace('.','-')) for key in keys]
return parent_list
def readCrossoverFile(f):
"""Read a file of crossover positions."""
crossovers = [0]
for line in f.readlines():
line = line.strip()
if line[0] == '#' or line == '': # comment line or empty line
continue
try:
crossovers = [int(x) for x in line.split()]
except ValueError:
print "Non-numeric data found in crossover file:\n", line.split()
crossovers.sort()
return crossovers
def mutationMatrix(fragments, parents):
mut_dict = {}
for f in range(len(fragments)):
(i,j) = fragments[f]
for p in range(len(parents)-1):
for q in range(p+1,len(parents)):
frag_p = parents[p][i:j]
frag_q = parents[q][i:j]
m = len([(a,b) for (a,b) in zip(frag_p,frag_q) if a != b])
mut_dict[(f,p,q)] = m
mut_dict[(f,q,p)] = m
return mut_dict
def getChimeraShortestDistanceLookup(chimera_blocks, fragments, parents, mut_dict):
ms = [0]*len(parents)
for p in range(len(parents)):
for f in range(len(fragments)):
q = int(chimera_blocks[f])-1
if q != p:
key = (f,p,q)
ms[p] += mut_dict[key]
return min(ms)
def averageMutation(fragments, parents):
avg_m = 0.0
num_chimeras = 0
p = len(parents)
n = len(fragments)
# Create a mutation matrix: how many mutations
mut_dict = mutationMatrix(fragments, parents)
for i in xrange(p**n):
# The next two lines turn i into a chimera block pattern
# (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
n2c = base(i,p)
chimera_blocks = ''.join(['1']*(n-len(n2c))+['%d'%(int(x)+1,) for x in n2c])
m = getChimeraShortestDistanceLookup(chimera_blocks, fragments, parents, mut_dict)
avg_m += m
num_chimeras += 1
return avg_m/num_chimeras
def averageMutationSampled(fragments, parents, num_samples):
# Estimate the average mutation level by taking num_samples samples.
avg_m = 0.0
num_chimeras = 0
p = len(parents)
n = len(fragments)
# Create a mutation matrix: how many mutations
mut_dict = mutationMatrix(fragments, parents)
library_size = p**n
if num_samples >= library_size: # Might as well be exact!
return averageMutation(fragments, parents)
for i in range(num_samples):
rand_num = random.randint(0,library_size)
# The next two lines turn i into a chimera block pattern
# (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
n2c = base(rand_num,p)
chimera_blocks = ''.join(['1']*(n-len(n2c))+['%d'%(int(x)+1,) for x in n2c])
m = getChimeraShortestDistanceLookup(chimera_blocks, fragments, parents, mut_dict)
avg_m += m
num_chimeras += 1
return avg_m/num_chimeras
def averageEnergy(contacts, fragments, parents):
avg_E = 0.0
num_chimeras = 0
p = len(parents)
n = len(fragments)
for i in range(len(parents)**len(fragments)):
# The next two lines turn i into a chimera block pattern
# (e.g., 0 -> '11111111', 1 -> '11111112', 2 -> '11111113'...)
n2c = base(i,p)
chimera_blocks = ''.join(['1']*(n-len(n2c))+['%d'%(int(x)+1,) for x in n2c])
E = getChimeraDisruption(chimera_blocks, contacts, fragments, parents)
avg_E += E
num_chimeras += 1
return avg_E/num_chimeras
def getCrossoversFromFragments(fragments):
"""Turns fragments, which are pairs of 0-based indices, into
1-based crossover indices."""
return [x+1 for (x,y) in fragments[1:]]
def getFragments(crossovers, parent):
"""Turns crossover points, which are 1-based for readability, into (begin,end)
pairs of 0-based indices into the parent sequence."""
xovers = crossovers[:]
if 1 not in xovers:
xovers = [1] + xovers
xovers.append(len(parent)+1)
fragments = [(xovers[i]-1, xovers[i+1]-1) for i in range(len(xovers)-1)]
return fragments
def splitSequence(seq_length, min_length):
"""Splits a sequence at a point randomly chosen from those which generate fragments
no smaller than min_length."""
return random.randint(min_length, seq_length-min_length)
def generateRandomCrossovers(seq_length, num_xovers, min_length):
fragments = generateRandomFragments(seq_length, num_xovers+1, min_length)
xovers = getCrossoversFromFragments(fragments)
return xovers
def generateRandomFragments(seq_length, num_fragments, min_length):
"""Generates a random partition that splits a sequence into num_fragments
pieces, each of which is no shorter than min_length."""
# We will assume that the inputs have been properly checked
assert(num_fragments*min_length <= seq_length)
# There's a trivial but very hard case which is that the only suitable
# crossover pattern is one which divvies up the protein into fragments
# of equal length. Just return that pattern if it arises.
if num_fragments*min_length == seq_length:
fragments = []
i = 0
for f in range(num_fragments):
fragments.append((i,i+min_length))
i += min_length
return fragments
done = False
while not done:
fragments = [(0,seq_length)]
tries = 0
# It's possible that no suitable set of fragments will be found on any
# given attempt, so we limit the number of tries at generating each
# crossover point.
max_tries = 4*num_fragments
# Challenge: choose num_fragments-1 points randomly along the sequence
# such that none of them lie within min_length of each other or of
# the ends.
crossovers = []
while len(fragments) < num_fragments and not tries > max_tries:
frag = random.choice(fragments)
(start, end) = frag
if end-start > 2*min_length:
# There's enough room for a crossover
crossover = start + splitSequence(end-start, min_length)
# Now replace the whole piece with its two fragments
fragments.remove(frag)
fragments.append((start,crossover))
fragments.append((crossover,end))
if len(fragments) == num_fragments:
done = True
else:
# Failed to find a good crossover pattern this time. Try again.
tries += 1
fragments.sort()
return fragments
def mean(x):
"""Computes the average of a list of numbers."""
if len(x) == 0:
raise ValueError, "Cannot take average of zero-length list"
return sum(x)/float(len(x))