Skip to content

Commit

Permalink
Check that all HIV seeds are used, as part of #441.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Apr 9, 2019
1 parent fd9e480 commit 1c192fe
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 7 deletions.
6 changes: 3 additions & 3 deletions micall/tests/test_fetch_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_single(self):
>A.B.C.R1
ACTGA
""")
expected_sequences = {'R1': 'ACTGA'}
expected_sequences = {'A.B.C.R1': 'ACTGA'}

sequences = parse_compendium(fasta)

Expand All @@ -53,7 +53,7 @@ def test_multiple_sequences(self):
>X.Y.R2
GATA
""")
expected_sequences = {'R1': 'ACTGA', 'R2': 'GATA'}
expected_sequences = {'A.B.C.R1': 'ACTGA', 'X.Y.R2': 'GATA'}

sequences = parse_compendium(fasta)

Expand All @@ -65,7 +65,7 @@ def test_multiple_lines(self):
ACTGA
TTACA
""")
expected_sequences = {'R1': 'ACTGATTACA'}
expected_sequences = {'A.B.C.R1': 'ACTGATTACA'}

sequences = parse_compendium(fasta)

Expand Down
24 changes: 21 additions & 3 deletions micall/utils/fetch_sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,20 @@ def check_hiv_seeds(project_config, unchecked_ref_names: set):
print("""\
HIV seed references come from the HIV Sequence Database Compendium. They are
downloaded from https://www.hiv.lanl.gov/content/sequence/NEWALIGN/align.html
Recombinant references are excluded (names start with a digit).
The HIV1-CON-XX-Consensus-seed is a special case: the consensus of consensuses,
downloaded from the same page with the Consensus/Ancestral alignment type.
See the project_seeds_from_compendium.py script for full details.
""")
sequences = fetch_alignment_sequences(2004,
'CON', # Consensus/Ancestral
'ENV')
consensus = sequences['CON_OF_CONS'].replace('-', '').upper()

source_sequences = fetch_alignment_sequences('2015', 'COM')
compendium_sequences = fetch_alignment_sequences('2015', 'COM')
source_sequences = {ref_name.split('.')[-1]: ref_sequence
for ref_name, ref_sequence in compendium_sequences.items()
if not ref_name[0].isdigit()} # Exclude recombinants.
consensus_accession = 'Consensus'
assert consensus_accession not in source_sequences, sorted(source_sequences.keys())
source_sequences[consensus_accession] = consensus
Expand Down Expand Up @@ -265,7 +270,7 @@ def parse_compendium(fasta):
sequences[key] = ''.join(section)
section = []
if line is not None:
key = line[1:].strip().split('.')[-1]
key = line[1:].strip()

return sequences

Expand All @@ -275,6 +280,8 @@ def fill_report(report):


def compare_config(ref_names, project_config, source_sequences, name_part=None):
source_sequences = dict(source_sequences) # Make a copy.
source_count = len(source_sequences)
differ = Differ()
error_count = 0
matching_references = []
Expand All @@ -287,7 +294,10 @@ def compare_config(ref_names, project_config, source_sequences, name_part=None):
else:
name_parts = ref_name.split('-')
source_name = name_parts[name_part]
source_sequence = source_sequences.get(source_name)
try:
source_sequence = source_sequences.pop(source_name)
except KeyError:
source_sequence = None
source_sequence = source_sequence and source_sequence.replace('-', '')
if source_sequence is None:
error_count += 1
Expand All @@ -314,6 +324,14 @@ def compare_config(ref_names, project_config, source_sequences, name_part=None):
f'{", ".join(missing_references)}')
missing_report = fill_report(missing_report)
report.write(missing_report)
if source_sequences:
unused_count = len(source_sequences)
unused_report = f'ERROR: Left {unused_count} out of {source_count} sequences unused: '
unused_report += ', '.join(sorted(source_sequences))
unused_report = fill_report(unused_report)
report.write(unused_report)
error_count += len(source_sequences)

if diff_report.tell():
report.write('ERROR: changed references:\n')
report.write(diff_report.getvalue())
Expand Down
3 changes: 2 additions & 1 deletion micall/utils/project_seeds_from_compendium.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def main():
'reference': seed_seq,
'seed_group_id': seed_group['id']})
if recombinant_names:
print('Skipped recombinants: ' + ', '.join(sorted(recombinant_names)))
print(f'Skipped {len(recombinant_names)} recombinants:',
', '.join(sorted(recombinant_names)))
if hiv_seeds:
seed_names = sorted(hiv_seeds.keys())
should_delete = True
Expand Down

0 comments on commit 1c192fe

Please sign in to comment.