Skip to content

Commit

Permalink
Explain changes from HXB2, as part of #441.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Apr 8, 2019
1 parent df4becd commit fd9e480
Show file tree
Hide file tree
Showing 3 changed files with 72 additions and 11 deletions.
2 changes: 1 addition & 1 deletion micall/project_scoring.json
Original file line number Diff line number Diff line change
Expand Up @@ -2400,7 +2400,7 @@
},
{
"coordinate_region": "HIV1B-vpr",
"coordinate_region_length": 78,
"coordinate_region_length": 96,
"key_positions": [],
"min_coverage1": 10,
"min_coverage2": 50,
Expand Down
2 changes: 1 addition & 1 deletion micall/projects.json
Original file line number Diff line number Diff line change
Expand Up @@ -30829,7 +30829,7 @@
"is_nucleotide": false,
"reference": [
"MEQAPEDQGPQREPHNEWTLELLEELKNEAVRHFPRIWLHGLGQHIYETYGDTWAGVEAIIRILQ",
"QLLFIHFQNWVST"
"QLLFIHFRIGCRHSRIGVTRQRRARNGASRS"
],
"seed_group": null
},
Expand Down
79 changes: 70 additions & 9 deletions micall/utils/fetch_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from time import sleep

# noinspection PyUnresolvedReferences
from gotoh import align_it_aa
from gotoh import align_it_aa, align_it

from micall.utils.translation import translate

Expand Down Expand Up @@ -82,16 +82,16 @@ def check_hiv_seeds(project_config, unchecked_ref_names: set):
'ENV')
consensus = sequences['CON_OF_CONS'].replace('-', '').upper()

new_sequences = fetch_alignment_sequences('2015', 'COM')
source_sequences = fetch_alignment_sequences('2015', 'COM')
consensus_accession = 'Consensus'
assert consensus_accession not in new_sequences, sorted(new_sequences.keys())
new_sequences[consensus_accession] = consensus
assert consensus_accession not in source_sequences, sorted(source_sequences.keys())
source_sequences[consensus_accession] = consensus
ref_names = project_config.getProjectSeeds('HIV')
unchecked_ref_names.difference_update(ref_names)

report, error_count = compare_config(ref_names,
project_config,
new_sequences,
source_sequences,
name_part=3)
print(report)
return error_count
Expand All @@ -102,12 +102,28 @@ def check_hiv_coordinates(project_config, unchecked_ref_names: set):
HIV coordinate references come from HXB2 (K03455). That sequence is trimmed
to each of the gene regions, then translated to amino acids. The gene positions
are documented at https://www.hiv.lanl.gov/content/sequence/HIV/REVIEWS/HXB2.html
We replace HXB2's premature stop codon in Nef with a W.
We replace HXB2's premature stop codon in Nef with the codon from Consensus B,
and drop a nucleotide of Vpr to avoid the frame shift.
V3LOOP is a subsection of Env from Consensus B with one nucleotide replaced by
the nucleotide from the overall consensus.
""")
hxb2 = fetch_by_accession('K03455')

consensus_sequences = fetch_alignment_sequences(2004,
'CON', # Consensus/Ancestral
'NEF')
consensus_b_nef = consensus_sequences['CONSENSUS_B'].replace('-', '').upper()

consensus_sequences = fetch_alignment_sequences('2004',
'CON', # Consensus/Ancestral
'ENV')
consensus_b_env = consensus_sequences['CONSENSUS_B'].replace('-', '').upper()
overall_consensus_env = \
consensus_sequences['CON_OF_CONS'].replace('-', '').upper()

region_boundaries = {'HIV1B-gag': (789, 2289),
'PR': (2252, 2549),
'RT': (2549, 4229),
'RT': (2549, 3869), # p51 only
'INT': (4229, 5093),
'HIV1B-vif': (5040, 5616),
'HIV1B-vpr': (5558, 5847),
Expand All @@ -117,9 +133,29 @@ def check_hiv_coordinates(project_config, unchecked_ref_names: set):
source_sequences = {}
for region, (start, stop) in region_boundaries.items():
source_nuc_sequence = hxb2[start:stop]
if region == 'HIV1B-nef':
# Replace premature stop codon with Consensus B.
source_nuc_sequence = (source_nuc_sequence[:369] +
consensus_b_nef[369:372] +
source_nuc_sequence[372:])
elif region == 'HIV1B-vpr':
# Drop nucleotide to avoid frame shift.
source_nuc_sequence = (source_nuc_sequence[:213] +
source_nuc_sequence[214:])
source_sequences[region] = translate(source_nuc_sequence)
nef_seq = source_sequences['HIV1B-nef']
source_sequences['HIV1B-nef'] = nef_seq[:123] + 'W' + nef_seq[124:]

consensus_b_v3_nuc_seq = consensus_b_env[876:981]
overall_v3_nuc_seq = overall_consensus_env[855:960]
consensus_b_v3_amino_seq = translate(consensus_b_v3_nuc_seq)
overall_v3_amino_seq = translate(overall_v3_nuc_seq)
assert 'CTRPNNNTRKSIHIGPGRAFYTTGEIIGDIRQAHC' == consensus_b_v3_amino_seq, consensus_b_v3_amino_seq
assert 'CTRPNNNTRKSIRIGPGQAFYATGDIIGDIRQAHC' == overall_v3_amino_seq, overall_v3_amino_seq
# Difference is here: ^

combined_v3_nuc_seq = (consensus_b_v3_nuc_seq[:63] +
overall_v3_nuc_seq[63] +
consensus_b_v3_nuc_seq[64:])
source_sequences['V3LOOP'] = translate(combined_v3_nuc_seq)

hiv_project = project_config.config['projects']['HIV']
ref_names = {project_region['coordinate_region']
Expand All @@ -133,6 +169,31 @@ def check_hiv_coordinates(project_config, unchecked_ref_names: set):
return error_count


def find_v3_match(env_seq, v3_loop_seq):
for frame in range(3):
env_aminos = translate('-' * frame + env_seq)
aligned_env, aligned_v3, score = align_it_aa(env_aminos,
v3_loop_seq,
GAP_OPEN_COST,
GAP_EXTEND_COST,
USE_TERMINAL_COST)
if score < 200:
continue
offset = (len(aligned_v3) - len(aligned_v3.lstrip('-'))) * 3 + frame
end = offset + len(v3_loop_seq)*3
print(translate(env_seq[offset:end]))
env_section = env_seq[offset + 60:offset + 69]
if env_section == 'TATGCAACA':
print('**G**')
elif env_section == 'TATACAACA':
print('**A**')
else:
print(f'??{env_section}??')
print(env_section)
print(translate(env_section))
print(score)


def find_best_match_for_pssm():
scenarios = (('ENV', '1997 1999 2000 2001 2002 2002JAN 2002AUG 2004'),
('GENOME', '2000 2001 2002'))
Expand Down

0 comments on commit fd9e480

Please sign in to comment.