-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhamonization.py
223 lines (202 loc) · 8.41 KB
/
hamonization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# %%
#import required modules
import gzip
import pandas as pd
from pyensembl import EnsemblRelease
import requests, sys
from liftover import get_lifter
# %%
#setup data - using pandas to leverage vectorization
#https://gatk.broadinstitute.org/hc/en-us/articles/360035890811-Resource-bundle
#https://ftp.ensembl.org/pub/release-110/variation/vcf/homo_sapiens/
def get_vcf_names(vcf_path):
with gzip.open(vcf_path, "rt") as ifile:
for line in ifile:
if line.startswith("#CHROM"):
vcf_names = [x for x in line.split('\t')]
break
ifile.close()
return vcf_names
vcf_file = '1000GENOMES-phase_3.vcf.gz'
names = get_vcf_names(vcf_file)
df = pd.read_csv('ukb_phase1to3_heart_may_2022_pheno10.fastGWA', delimiter='\t')
vcf = pd.read_csv(vcf_file, compression='gzip', comment='#',delim_whitespace=True, header=None, names=names)
server = "https://rest.ensembl.org"
converter = get_lifter('hg19', 'hg38')
# %%
#for testing only
DF = df.copy()
# %% [markdown]
# Step 1: Mapping variant IDs to locations
# - Update base pair location value by mapping rsID using latest Ensembl release Q(‘hm_coordinate_conversion’ = ‘rs’); or
# - if above not possible, liftover base pair location to latest genome build (‘hm_coordinate_conversion’ = ‘lo’); or
# - if above not possible, remove variant from file.
# %%
def rsIDtoEnsembl(row):
#attempt with local file
local_vcf_query = vcf.loc[vcf['ID'] == row['SNP']]
if not local_vcf_query.empty:
return [local_vcf_query['#CHROM'].iloc[0], local_vcf_query['POS'].iloc[0]]
#attempt with ensembl API
ext = '/variation/human/' + row['SNP'] + '?'
r = requests.get(server+ext, headers={ 'Content-Type' : 'application/json'})
if not r.ok:
return [None, None]
decoded = r.json()
if 'failed' in decoded:
return [None, None]
try:
return list(map(int, decoded['mappings'][0]['location'].split('-', 1)[0].split(':', 1)))
except:
print(decoded)
def liftoverM(row):
ret = converter.convert_coordinate(row['CHR'], row['POS'])
if (ret == None or len(ret) == 0):
return [None, None]
return [ret[0][0][3], ret[0][1]]
def mapIDs(row):
[chrom, pos] = rsIDtoEnsembl(row)
if (chrom == None or pos == None):
[chrom, pos] = liftoverM(row)
if (chrom == None or pos == None):
return None
row['#CHROM'] = chrom
row['POS'] = pos
return row
DF = DF[DF.apply(mapIDs, axis=1) != None]
# %% [markdown]
# Step 2: Orientation, part(a): Infer the orientation of palindromic variants
# - Firstly, we randomly select 10% of sites. The effect and other alleles are compared with counterpart alternative and reference alleles in the Ensembl VCF references to identify the strand of the non-palindromic variants (forward or reverse).
# - The forward strand consensus can be calculated by forward/(forward+reverse) or reverse/(forward+reverse). To avoid any possibility of sampling bias:
# - If the rate ≥ 0.995, the following harmonisation steps on the palindromic variants are inferred as on the forward (or reverse) strand;
# - If the rate is the range of (0.995,0.9), this rate is recalculated by all non-palindromic variants in the data. The palindromic variants can be inferred as forward (or reverse) if the recalculated rate > 0.99, otherwise palindromic variants are dropped for harmonisation;
# - If the rate ≤ 0.9, palindromic variants are dropped in the following harmonisation step.
#
#
# %%
class Counter: #create function class for vectorized implementation
def __init__(self, seed):
self.counter = 0
self.valid = seed
def fun(self, l):
f = l[0]
v = l[1]
chrom = self['CHR']
loc = self['POS']
ref = self['A1']
alt = self['A2']
reference = vcf.loc[vcf['CHR'] == chrom and vcf['POS'] == loc]
if reference.empty:
return [self.counter + f, self.valid + v - 1]
else:
if ref == reference['REF'] and alt == reference['ALT']:
return [self.counter + f + 1, self.valid + v]
else:
return [self.counter + f, self.valid + v]
def get_consensus(df):
counter = Counter(len(df.index))
count = df.apply(counter.fun)
forward_consensus = count[0]/(count[0] + count[1])
return forward_consensus
def reverse_complement(seq):
complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
reverse_complement = "".join(complement.get(base, base) for base in reversed(seq))
return reverse_complement
def isPalindrome(variant):
return variant == reverse_complement(variant)
def infer_orientation(df):
sample_consenus = get_consensus(df.sample(frac=0.1))
if forward_consensus >= 0.995:
return 1 #forward
elif forward_consensus <= 0.005:
return -1 #reverse
elif forward_consensus > 0.9 or forward_consensus < 0.1:
sample_consenus = get_consensus(df[isPalindrome(df['A2'])])
if forward_consensus >= 0.99:
return 1 #forward
elif forward_consensus <= 0.01:
return -1
else:
return 0 #ambiguous
def harmonize_palindrome(row, orientation):
chrom = row['CHR']
loc = row['POS']
ref = row['A1']
alt = row['A2']
reference = vcf.loc[vcf['CHR'] == chrom and vcf['POS'] == loc]
if orientation == 1:
#assume on forward strand, no need to reverse complement
if ref == alt:
#alleles in correct orientation, no need to do anything
return row
else:
#alleles in flipped orientation, need to flip strand
return flip(row)
elif orientation == -1:
#assume on reverse strand, need to reverse complement
row['A1'] = reverse_complement(ref)
row['A2'] = reverse_complement(alt)
if ref == reverse_complement(alt):
#alleles in correct orientation, no need to do anything
return row
else:
#alleles in flipped orientation, need to flip strand
return flip(row)
else:
#ambiguous, do not harmonize
return None
orientation = infer_orientation(DF)
# %% [markdown]
# Step 2: Orientation, part(b): Using chromosome, base pair location and the effect and other alleles, query each variant against the Ensembl VCF reference to harmonise as appropriate by either:
# - Variant harmonisation: Using chromosome, base pair location and the effect and other alleles, query each variant against the Ensembl VCF reference to harmonise as appropriate by either:
# - keeping record as is because:
# - it is already correctly orientated
# - orientating to reference strand:
# - reverse complement the effect and other alleles
# - flipping the effect and other alleles
# - because the effect and other alleles are flipped in the reference
# - this also means the beta, odds ratio, 95% CI and effect allele frequency are inverted
# - a combination of the orientating and flipping the alleles.
# - replace with NA because:
# - There is no counterpart record in the reference VCF file.
# %%
def flip(row):
row['AF1'] = 1 - row['AF1']
row['BETA'] = -row['BETA']
row['SE'] = row['SE']
row['P'] = row['P']
return row
def harmonize(row):
chrom = row['CHR']
loc = row['POS']
ref = row['A1']
alt = row['A2']
if isPalindrome(alt):
return process_palindrome(row, orientation)
reference = vcf.loc[vcf['CHR'] == chrom and vcf['POS'] == loc]
if reference.empty:
row['A1'] = 'NA'
row['A2'] = 'NA'
return row
reference = reference.iloc[0]
if ref == reference['A1'] and alt == reference['A2']:
return row
elif ref == reference['A2'] and alt == reference['A1']:
#alleles on forward strand, no need to reverse complement
if ref == alt:
#alleles in correct orientation, no need to do anything
return row
else:
#alleles in flipped orientation, need to flip strand
return flip(row)
else:
#alleles on reverse strand, need to reverse complement
row['A1'] = reverse_complement(ref)
row['A2'] = reverse_complement(alt)
if ref == reverse_complement(alt):
#alleles in correct orientation, no need to do anything
return row
else:
#alleles in flipped orientation, need to flip strand
return flip(row)
DF = DF.apply(harmonize, axis=1)