-
Notifications
You must be signed in to change notification settings - Fork 1
/
Pi_MS_TajD_perBp.py
186 lines (130 loc) · 4.33 KB
/
Pi_MS_TajD_perBp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
import sys
from optparse import OptionParser
import copy
######################
def clusterHaplotypes(inFile, outFile):
#Every 10Kb, calculate Pi
nSam=sample_size +1
lineNumber = 1
SNP_count = 0
flies = initialize()
for line in inFile:
coord = float(line.split(',')[0])
SNP_count +=1
#add to the flies vector
for i in range(1,nSam):
flies[i].append(line.split(',')[i])
SNP_count=SNP_count/locus_length
# calculate pi
Pi=calcPi_2(flies)
Pi=Pi/locus_length
avgPairwiseDiff=calcPi(flies)
avgPairwiseDiff=avgPairwiseDiff/locus_length
TajD = calcTajD(SNP_count, avgPairwiseDiff, flies)
outFile.write(str(SNP_count) + '\t' + str(Pi) + '\t' + str(avgPairwiseDiff) +'\t' + str(TajD) + '\n' )
def initialize():
nSam = sample_size +1
flies={}
for i in range(1,nSam):
flies[i] = []
return flies
def calcPi_2(flies):
# in this definition I will calculate allele frequencies and return a vector of the frequencies in the given population
nSam=sample_size + 1
frequencies = []
for j in range(0, len(flies[1])):
nucleotides = [0,0,0,0]
for i in range(1,nSam):
if flies[i][j] == 'A':
nucleotides[0] +=1
if flies[i][j] == 'T':
nucleotides[1] +=1
if flies[i][j] == 'G':
nucleotides[2] +=1
if flies[i][j] == 'C':
nucleotides[3] +=1
# check whether or not this SNP has exactly two alleles
counter=0
for y in range(0, len(nucleotides)):
if nucleotides[y]>0:
counter +=1
if counter == 2:
frequencies.append(float(max(nucleotides))/sum(nucleotides))
# Now iterate through the frequencies vector and calculate pi
Pi=0
for w in range(0, len(frequencies)):
Pi += 2*frequencies[w]*(1-frequencies[w])
Pi = Pi*(sample_size)/(float(sample_size-1))
# print frequencies
# print Pi
return Pi
def calcPi(flies):
# caluclate the average pairwise difference in SNPs
# make a nested for loop and calculate the pairwise difference between each pair of strains
nSam = sample_size +1
pairwiseDiffArray = []
numComparisons = 0
for i in range(1, nSam-1):
for j in range(i+1,nSam):
diff = pairwiseDiff(flies, i, j)
pairwiseDiffArray.append(diff)
numComparisons +=1
avgPairwiseDiff = float(sum(pairwiseDiffArray))/float(numComparisons)
return avgPairwiseDiff
def pairwiseDiff(flies, i, j):
diff=0
for w in range(0, len(flies[i])):
if flies[i][w] != flies[j][w] and flies[i][w] != 'N' and flies[j][w] != 'N':
diff+=1
return diff
def calcTajD(numSNPs, avgPairwiseDiff, flies):
# Need to calculate the normalizing constant
# a1
a1=0
a2=0
n=sample_size
for i in range(1, n):
a1 += 1/float(i)
a2 += 1/(float(i))**2
b1 = (n+1)/float((3*(n-1)))
b2 = 2*(n**2 + n + 3)/float((9*n*(n-1)))
c1 = b1 - 1/float(a1)
c2 = b2 - float(n+2)/float(a1*n) + float(a2)/float(a1)**2
e1 =c1/float(a1)
e2 = c2/float(a1**2 + a2)
if numSNPs > 0:
TajD = (avgPairwiseDiff - numSNPs/float(a1))/(e1*numSNPs + e2*numSNPs*(numSNPs-1))**0.5
else:
TajD =0
return TajD
###############
def mkOptionParser():
""" Defines options and returns parser """
usage = """%prog <input.bed> <output.bed> <threshold>
%prog filters out the lines that don't meet a certain threshold. """
parser = OptionParser(usage)
return parser
def main():
""" see usage in mkOptionParser. """
parser = mkOptionParser()
options, args= parser.parse_args()
if len(args) != 4:
parser.error("Incorrect number of arguments")
inFN = args[0]
outFN = args[1]
global sample_size
sample_size = int(args[2])
global locus_length
locus_length = float(args[3])
if inFN == '-':
inFile = sys.stdin
else:
inFile = open(inFN, 'r')
if outFN == '-':
outFile = sys.stdout
else:
outFile = open(outFN, 'w')
clusterHaplotypes(inFile, outFile)
#run main
if __name__ == '__main__':
main()