From 30cd8b50b744755ca8aad9fbd82ef914dffea6c0 Mon Sep 17 00:00:00 2001 From: Don Kirkby Date: Mon, 1 Nov 2021 14:34:08 -0700 Subject: [PATCH] Combine aligned sections when they share a reading frame, part of #745. --- micall/tests/test_aln2counts_report.py | 31 +++++++++++++++++++------- micall/tests/test_consensus_aligner.py | 4 ++-- micall/utils/consensus_aligner.py | 13 +++++++---- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/micall/tests/test_aln2counts_report.py b/micall/tests/test_aln2counts_report.py index 0ebb155a1..2cb32244a 100644 --- a/micall/tests/test_aln2counts_report.py +++ b/micall/tests/test_aln2counts_report.py @@ -964,12 +964,17 @@ def test_whole_genome_consensus(default_sequence_report): The given read spans more than 1 region. """ aligned_reads = prepare_reads("""\ -HIV1-B-FR-K03455-seed,15,0,9,1,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG +HIV1-B-FR-K03455-seed,15,0,9,1,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACA\ +GGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG """) expected_section = """\ -HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,601,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG -HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,601,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG""" +HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,601,\ +AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACA\ +GGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG +HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,601,\ +AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACA\ +GGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG""" report_file = StringIO() default_sequence_report.write_consensus_stitched_header(report_file) @@ -986,15 +991,25 @@ def test_whole_genome_consensus(default_sequence_report): assert key_report == expected_section -@pytest.mark.skip(reason="Is currently failing") def test_deleted_nucleotide(default_sequence_report): -# this is the lost nucleotide: * aligned_reads = prepare_reads("""\ -HIV1-B-FR-K03455-seed,15,0,10,1,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG +HIV1-B-FR-K03455-seed,15,0,10,1,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAA\ +AGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACT\ +TGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGAT\ +GGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG """) + # ^ lost nucleotide is G on the second line from the end, in TGC expected_text = """\ -HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,5359,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG -HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,5359,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG""" +HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,5359,\ +ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAA\ +AGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACT\ +TGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGAT\ +GGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG +HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,5359,\ +ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAA\ +AGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACT\ +TGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGAT\ +GGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG""" report_file = StringIO() default_sequence_report.write_consensus_stitched_header(report_file) default_sequence_report.read(aligned_reads) diff --git a/micall/tests/test_consensus_aligner.py b/micall/tests/test_consensus_aligner.py index eb7db1268..3bd70669e 100644 --- a/micall/tests/test_consensus_aligner.py +++ b/micall/tests/test_consensus_aligner.py @@ -355,8 +355,8 @@ def test_start_contig_close_frame_changes(projects): amino_ref=amino_ref) assert_consensus_nuc_indexes(report_aminos, - list(range(801, 900)) + - list(range(903, 1001)), + list(range(801, 904)) + + list(range(907, 1001)), 790, 2292) diff --git a/micall/utils/consensus_aligner.py b/micall/utils/consensus_aligner.py index cd13bdc6c..afbde6098 100644 --- a/micall/utils/consensus_aligner.py +++ b/micall/utils/consensus_aligner.py @@ -381,8 +381,13 @@ def find_amino_alignments(self, can_align = (MINIMUM_AMINO_ALIGNMENT <= min( prev_alignment.amino_size, next_alignment.amino_size)) - if can_align and (size % 3 != 0 or MAXIMUM_AMINO_GAP < size//3): - # Keep dividing around this indel. + has_frame_shift = (prev_alignment.reading_frame != + next_alignment.reading_frame) + has_big_gap = MAXIMUM_AMINO_GAP < size // 3 + if can_align and (has_frame_shift or has_big_gap): + # Both neighbours are big enough, so we have a choice. + # Either there's a frame shift or a big gap, so keep + # dividing around this indel. continue # Merge the two sections on either side of this indel. amino_sections.pop(i+1) @@ -501,8 +506,8 @@ def build_amino_report(self, repeat_position, skip_position, amino_ref) + has_skipped_nucleotide = False if skip_position is not None: - has_skipped_nucleotide = False reading_frame1 = reading_frame2 = None for alignment in self.amino_alignments: if alignment.ref_end == skip_position: @@ -644,7 +649,7 @@ def update_report_amino(coord_index: int, start_pos: int, repeat_position: int = None, skip_position: int = None, - skipped_nuc = None,): + skipped_nuc=None): report_amino = report_aminos[coord_index] report_amino.seed_amino.add(seed_amino) ref_nuc_pos = coord_index * 3 + start_pos