-
Notifications
You must be signed in to change notification settings - Fork 6
/
QTLSumEditor.py
executable file
·119 lines (108 loc) · 6.78 KB
/
QTLSumEditor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/python
print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
print " QTLTools Summary Editor"
print ""
print ""
print "* Written by : Jacco Schaap | [email protected]"
print "* Suggested for by : Sander W. van der Laan | [email protected]"
print "* Last update : 2018-06-15"
print "* Name : QTLSumEditor"
print "* Version : v1.2.2"
print ""
print "* Description : This script will collect r-square between locus and QTL "
print " variant when data is clumped."
print ""
print "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
### created by Jacco Schaap, 17-6-'17
from sys import argv
import gzip
from subprocess import call
nominal = argv[1] # Nominal data, e.g. ${SUMMARY}/${STUDYNAME}_QC_qtlnom_summary.txt.gz
nominal_clump = argv[2] # Clumped nominal data, e.g. ${SUMMARY}/${STUDYNAME}_QC_qtlnom_clumped_summary.txt.gz
permuted = argv[3] # Permuted data, e.g. ${SUMMARY}/${STUDYNAME}_QC_qtlperm_summary.txt.gz
permuted_clump = argv[4] # Clumped permute data, e.g. ${SUMMARY}/${STUDYNAME}_QC_qtlperm_clumped_summary.txt.gz
CLUMPDIR = argv[5] # Directory were clumped data can be found, e.g. ${CLUMPDIR}
"""
so for now the code only works on nominal clumped data. Let's fix it for normal and permuted data.
We wanna see the rsquared between all variants!
Input is summarized data file, (nominal, nominal_clumped, permuted, ?permuted_nominal)
It now works only on nominal data cause this is the input data of the plotter script.
What I need to do:
read also the permuted summary, specify in plotter script
Header nominal file = 'Locus,ProbeID,VARIANT,Chr,BP,OtherAlleleA,CodedAlleleA,MAF,MAC,CAF,HWE,Info,Imputation,N,GeneName,EntrezID,Distance_VARIANT_GENE,Strand,Chr_Gene,GeneTxStart,GeneTxEnd,Beta,SE,Nominal_P,Bonferroni,BenjHoch,Q
Header permuted file = 'Locus,ProbeID,VARIANT,Chr,BP,OtherAlleleA,CodedAlleleA,MAF,MAC,CAF,HWE,Info,Imputation,N,GeneName,EntrezID,Distance_VARIANT_GENE,Strand,Chr_Gene,GeneTxStart,GeneTxEnd,Beta,SE,Nominal_P,Perm_P,ApproxPerm_P,Bonferroni,BenjHoch,Q'
"""
def read_summary():
print '\nStarted with updating (clumped) >> NOMINAL << summary file.\n'
for i in range(1,3):
# print str(i)
print '* Updating data from summary file [ ' + str(argv[i] + ' ].\n')
with gzip.open(argv[i], 'r') as file, gzip.open(argv[i] + '_temp.txt.gz', 'w') as out:
out.write('Locus,ProbeID,VARIANT,RSquare,Chr,BP,OtherAlleleA,CodedAlleleA,MAF,MAC,CAF,HWE,Info,Imputation,N,GeneName,EntrezID,Distance_VARIANT_GENE,Strand,Chr_Gene,GeneTxStart,GeneTxEnd,Beta,SE,Nominal_P,Bonferroni,BenjHoch,Q\n')
for regel in file.readlines():
snp_found = 0
regel = regel.strip('\n')
line = regel.split(',')
locus = line[0]
# print '* DEBUG: updating data for locus: ' + locus + '.\n'
variant = line[2]
if line[0] == 'Locus':
continue
with open(CLUMPDIR + '/ldbuddies_' + locus + '.list', 'r') as ldfile:
# print '* DEBUG: opened ldbuddie file, split on tab: ' + CLUMPDIR + '/ldbuddies_' + locus + '.list.\n'
for snp in ldfile.readlines():
snp = snp.strip('\n')
# print '* DEBUG: reading this variant:' + snp + '.\n'
snp = snp.split('\t')
try:
if snp[1] == variant:
# print '* DEBUG: extracting data from ' + regel + ' and ' + snp[2] + '.\n'
newline = ','.join(line[0:3]) + ',' + snp[2] + ',' + ','.join(line[3:27])
out.write(newline + '\n')
snp_found += 1
except IndexError:
print '*** WARNING *** cannot find this variant:' + snp + '.\n'
continue
if snp_found == 0:
# print '*** WARNING *** ' + str(snp) + ' not found with current rsquared threshold, writing NR.\n'
newline = ','.join(line[0:3]) + ',' + 'NR' + ',' + ','.join(line[3:27])
out.write(newline + '\n')
### Overwrite original summary file with temporary updated one
call(["mv", argv[i] + '_temp.txt.gz', argv[i]])
print '\nStarted with updating (clumped) >> PERMUTATION << summary file.\n'
for i in range(3,5):
# print str(i)
print '* Updating data from summary file [ ' + str(argv[i] + ' ].\n')
with gzip.open(argv[i], 'r') as file, gzip.open(argv[i] + '_temp.txt.gz', 'w') as out:
out.write('Locus,ProbeID,VARIANT,RSquare,Chr,BP,OtherAlleleA,CodedAlleleA,MAF,MAC,CAF,HWE,Info,Imputation,N,GeneName,EntrezID,Distance_VARIANT_GENE,Strand,Chr_Gene,GeneTxStart,GeneTxEnd,Beta,SE,Nominal_P,Perm_P,ApproxPerm_P,Bonferroni,BenjHoch,Q\n')
for regel in file.readlines():
snp_found = 0
regel = regel.strip('\n')
line = regel.split(',')
locus = line[0]
# print '* DEBUG: updating data for locus: ' + locus + '.\n'
variant = line[2]
if line[0] == 'Locus':
continue
with open(CLUMPDIR + '/ldbuddies_' + locus + '.list', 'r') as ldfile:
# print '* DEBUG: opened ldbuddie file, split on tab: ' + CLUMPDIR + '/ldbuddies_' + locus + '.list.\n'
for snp in ldfile.readlines():
snp = snp.strip('\n')
# print '* DEBUG: reading this variant:' + snp + '.\n'
snp = snp.split('\t')
try:
if snp[1] == variant:
# print '* DEBUG: extracting data from ' + regel + ' and ' + snp[2] + '.\n'
newline = ','.join(line[0:3]) + ',' + snp[2] + ',' + ','.join(line[3:29])
out.write(newline + '\n')
snp_found += 1
except IndexError:
print '*** WARNING *** cannot find this variant:' + snp + '.\n'
continue
if snp_found == 0:
# print '*** WARNING *** ' + str(snp) + ' not found with current rsquared threshold, writing NR.\n'
newline = ','.join(line[0:3]) + ',' + 'NR' + ',' + ','.join(line[3:29])
out.write(newline + '\n')
### Overwrite original summary file with temporary updated one
call(["mv", argv[i] + '_temp.txt.gz', argv[i]])
read_summary()