Skip to content

Commit

Permalink
Combine aligned sections when they share a reading frame, part of #745.
Browse files Browse the repository at this point in the history
  • Loading branch information
donkirkby committed Nov 1, 2021
1 parent ff78e53 commit 30cd8b5
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 14 deletions.
31 changes: 23 additions & 8 deletions micall/tests/test_aln2counts_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -964,12 +964,17 @@ def test_whole_genome_consensus(default_sequence_report):
The given read spans more than 1 region.
"""
aligned_reads = prepare_reads("""\
HIV1-B-FR-K03455-seed,15,0,9,1,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG
HIV1-B-FR-K03455-seed,15,0,9,1,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACA\
GGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG
""")

expected_section = """\
HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,601,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG
HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,601,AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACAGGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG"""
HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,601,\
AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACA\
GGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG
HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,601,\
AGACCCTTTTAGTCAGTGTGGAAAATCTCTAGCAGTGGCGCCCGAACA\
GGGACCTGAAAGCGAAAGGGAAACCAGAGGAGCTCTCTCGACGCAGGACTCG"""

report_file = StringIO()
default_sequence_report.write_consensus_stitched_header(report_file)
Expand All @@ -986,15 +991,25 @@ def test_whole_genome_consensus(default_sequence_report):
assert key_report == expected_section


@pytest.mark.skip(reason="Is currently failing")
def test_deleted_nucleotide(default_sequence_report):
# this is the lost nucleotide: *
aligned_reads = prepare_reads("""\
HIV1-B-FR-K03455-seed,15,0,10,1,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG
HIV1-B-FR-K03455-seed,15,0,10,1,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAA\
AGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACT\
TGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGAT\
GGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG
""")
# ^ lost nucleotide is G on the second line from the end, in TGC
expected_text = """\
HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,5359,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG
HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,5359,ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAAAGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACTTGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGATGGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG"""
HIV1-B-FR-K03455-seed,whole genome consensus,15,MAX,5359,\
ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAA\
AGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACT\
TGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGAT\
GGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG
HIV1-B-FR-K03455-seed,whole genome consensus,15,0.100,5359,\
ATTCATCTGTATTACTTTGACTGTTTTTCAGACTCTGCTATAAGAA\
AGGCCTTATTAGGACACATAGTTAGCCCTAGGTGTGAATATCAAGCAGGACATAACAAGGTAGGATCTCTACAATACT\
TGGCACTGCATGCATTAATAACACCAAAAAAGATAAAGCCACCTTTGCCTAGTGTTACGAAACTGACAGAGGATAGAT\
GGAACAAGCCCCAGAAGACCAAGGGCCACAGAGGGAGCCACACAATGAATGGACACTAG"""
report_file = StringIO()
default_sequence_report.write_consensus_stitched_header(report_file)
default_sequence_report.read(aligned_reads)
Expand Down
4 changes: 2 additions & 2 deletions micall/tests/test_consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,8 +355,8 @@ def test_start_contig_close_frame_changes(projects):
amino_ref=amino_ref)

assert_consensus_nuc_indexes(report_aminos,
list(range(801, 900)) +
list(range(903, 1001)),
list(range(801, 904)) +
list(range(907, 1001)),
790,
2292)

Expand Down
13 changes: 9 additions & 4 deletions micall/utils/consensus_aligner.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,8 +381,13 @@ def find_amino_alignments(self,
can_align = (MINIMUM_AMINO_ALIGNMENT <= min(
prev_alignment.amino_size,
next_alignment.amino_size))
if can_align and (size % 3 != 0 or MAXIMUM_AMINO_GAP < size//3):
# Keep dividing around this indel.
has_frame_shift = (prev_alignment.reading_frame !=
next_alignment.reading_frame)
has_big_gap = MAXIMUM_AMINO_GAP < size // 3
if can_align and (has_frame_shift or has_big_gap):
# Both neighbours are big enough, so we have a choice.
# Either there's a frame shift or a big gap, so keep
# dividing around this indel.
continue
# Merge the two sections on either side of this indel.
amino_sections.pop(i+1)
Expand Down Expand Up @@ -501,8 +506,8 @@ def build_amino_report(self,
repeat_position,
skip_position,
amino_ref)
has_skipped_nucleotide = False
if skip_position is not None:
has_skipped_nucleotide = False
reading_frame1 = reading_frame2 = None
for alignment in self.amino_alignments:
if alignment.ref_end == skip_position:
Expand Down Expand Up @@ -644,7 +649,7 @@ def update_report_amino(coord_index: int,
start_pos: int,
repeat_position: int = None,
skip_position: int = None,
skipped_nuc = None,):
skipped_nuc=None):
report_amino = report_aminos[coord_index]
report_amino.seed_amino.add(seed_amino)
ref_nuc_pos = coord_index * 3 + start_pos
Expand Down

0 comments on commit 30cd8b5

Please sign in to comment.