From 1c192fe2472f7a136df5b6431a0640f2f26becdd Mon Sep 17 00:00:00 2001 From: donkirkby Date: Tue, 9 Apr 2019 13:39:32 -0700 Subject: [PATCH] Check that all HIV seeds are used, as part of #441. --- micall/tests/test_fetch_sequences.py | 6 ++--- micall/utils/fetch_sequences.py | 24 ++++++++++++++++--- micall/utils/project_seeds_from_compendium.py | 3 ++- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/micall/tests/test_fetch_sequences.py b/micall/tests/test_fetch_sequences.py index eccb91416..48e4c54eb 100644 --- a/micall/tests/test_fetch_sequences.py +++ b/micall/tests/test_fetch_sequences.py @@ -29,7 +29,7 @@ def test_single(self): >A.B.C.R1 ACTGA """) - expected_sequences = {'R1': 'ACTGA'} + expected_sequences = {'A.B.C.R1': 'ACTGA'} sequences = parse_compendium(fasta) @@ -53,7 +53,7 @@ def test_multiple_sequences(self): >X.Y.R2 GATA """) - expected_sequences = {'R1': 'ACTGA', 'R2': 'GATA'} + expected_sequences = {'A.B.C.R1': 'ACTGA', 'X.Y.R2': 'GATA'} sequences = parse_compendium(fasta) @@ -65,7 +65,7 @@ def test_multiple_lines(self): ACTGA TTACA """) - expected_sequences = {'R1': 'ACTGATTACA'} + expected_sequences = {'A.B.C.R1': 'ACTGATTACA'} sequences = parse_compendium(fasta) diff --git a/micall/utils/fetch_sequences.py b/micall/utils/fetch_sequences.py index 947dbcc5c..b32ac0dba 100644 --- a/micall/utils/fetch_sequences.py +++ b/micall/utils/fetch_sequences.py @@ -74,15 +74,20 @@ def check_hiv_seeds(project_config, unchecked_ref_names: set): print("""\ HIV seed references come from the HIV Sequence Database Compendium. They are downloaded from https://www.hiv.lanl.gov/content/sequence/NEWALIGN/align.html +Recombinant references are excluded (names start with a digit). The HIV1-CON-XX-Consensus-seed is a special case: the consensus of consensuses, downloaded from the same page with the Consensus/Ancestral alignment type. +See the project_seeds_from_compendium.py script for full details. """) sequences = fetch_alignment_sequences(2004, 'CON', # Consensus/Ancestral 'ENV') consensus = sequences['CON_OF_CONS'].replace('-', '').upper() - source_sequences = fetch_alignment_sequences('2015', 'COM') + compendium_sequences = fetch_alignment_sequences('2015', 'COM') + source_sequences = {ref_name.split('.')[-1]: ref_sequence + for ref_name, ref_sequence in compendium_sequences.items() + if not ref_name[0].isdigit()} # Exclude recombinants. consensus_accession = 'Consensus' assert consensus_accession not in source_sequences, sorted(source_sequences.keys()) source_sequences[consensus_accession] = consensus @@ -265,7 +270,7 @@ def parse_compendium(fasta): sequences[key] = ''.join(section) section = [] if line is not None: - key = line[1:].strip().split('.')[-1] + key = line[1:].strip() return sequences @@ -275,6 +280,8 @@ def fill_report(report): def compare_config(ref_names, project_config, source_sequences, name_part=None): + source_sequences = dict(source_sequences) # Make a copy. + source_count = len(source_sequences) differ = Differ() error_count = 0 matching_references = [] @@ -287,7 +294,10 @@ def compare_config(ref_names, project_config, source_sequences, name_part=None): else: name_parts = ref_name.split('-') source_name = name_parts[name_part] - source_sequence = source_sequences.get(source_name) + try: + source_sequence = source_sequences.pop(source_name) + except KeyError: + source_sequence = None source_sequence = source_sequence and source_sequence.replace('-', '') if source_sequence is None: error_count += 1 @@ -314,6 +324,14 @@ def compare_config(ref_names, project_config, source_sequences, name_part=None): f'{", ".join(missing_references)}') missing_report = fill_report(missing_report) report.write(missing_report) + if source_sequences: + unused_count = len(source_sequences) + unused_report = f'ERROR: Left {unused_count} out of {source_count} sequences unused: ' + unused_report += ', '.join(sorted(source_sequences)) + unused_report = fill_report(unused_report) + report.write(unused_report) + error_count += len(source_sequences) + if diff_report.tell(): report.write('ERROR: changed references:\n') report.write(diff_report.getvalue()) diff --git a/micall/utils/project_seeds_from_compendium.py b/micall/utils/project_seeds_from_compendium.py index 4a7d631c5..6b3acdaa0 100644 --- a/micall/utils/project_seeds_from_compendium.py +++ b/micall/utils/project_seeds_from_compendium.py @@ -121,7 +121,8 @@ def main(): 'reference': seed_seq, 'seed_group_id': seed_group['id']}) if recombinant_names: - print('Skipped recombinants: ' + ', '.join(sorted(recombinant_names))) + print(f'Skipped {len(recombinant_names)} recombinants:', + ', '.join(sorted(recombinant_names))) if hiv_seeds: seed_names = sorted(hiv_seeds.keys()) should_delete = True