From 750e48c23912161c325f5605fa135488d62283d8 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 09:00:38 -0500 Subject: [PATCH 1/6] bump viral-core 2.2.3 to 2.2.4 --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 29157c8a..cdd8a2aa 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM quay.io/broadinstitute/viral-core:2.2.3 +FROM quay.io/broadinstitute/viral-core:2.2.4 LABEL maintainer "viral-ngs@broadinstitute.org" From ea94cc1fe1fde6917f30020a3382095d39b2be00 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 09:21:49 -0500 Subject: [PATCH 2/6] add new option --allow_incomplete_output to order_and_orient --- assembly.py | 11 +++++-- .../contigs.lasv.one_small.fasta | 9 +++++ test/unit/test_assembly.py | 33 ++++++++++++++++++- 3 files changed, 50 insertions(+), 3 deletions(-) create mode 100644 test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta diff --git a/assembly.py b/assembly.py index 031e0ad3..772a65b5 100755 --- a/assembly.py +++ b/assembly.py @@ -428,7 +428,7 @@ def order_and_orient(inFasta, inReference, outFasta, breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None, maxgap=200, minmatch=10, mincluster=None, min_pct_id=0.6, min_contig_len=200, min_pct_contig_aligned=0.3, n_genome_segments=0, - outStats=None, threads=None): + allow_incomplete_output=False, outStats=None, threads=None): ''' This step cleans up the de novo assembly with a known reference genome. Uses MUMmer (nucmer or promer) to create a reference-based consensus sequence of aligned contigs (with runs of N's in between the de novo @@ -481,7 +481,7 @@ def order_and_orient(inFasta, inReference, outFasta, base_counts = [sum([len(seg.seq.ungap('N')) for seg in scaffold]) \ if len(scaffold)==n_genome_segments else 0 for scaffold in scaffolds] best_ref_num = numpy.argmax(base_counts) - if len(scaffolds[best_ref_num]) != n_genome_segments: + if (len(scaffolds[best_ref_num]) != n_genome_segments) and not allow_incomplete_output: raise IncompleteAssemblyError(len(scaffolds[best_ref_num]), n_genome_segments) log.info('base_counts={} best_ref_num={}'.format(base_counts, best_ref_num)) shutil.copyfile(scaffolds_fasta[best_ref_num], outFasta) @@ -525,6 +525,13 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()): parser.add_argument('--outReference', help='Output the reference chosen for scaffolding to this file') parser.add_argument('--outStats', help='Output stats used in reference selection') + parser.add_argument( + "--allow_incomplete_output", + help="Do not fail with IncompleteAssemblyError if we fail to recover all segments of the desired genome.", + default=False, + action="store_true", + dest="allow_incomplete_output" + ) #parser.add_argument('--aligner', # help='nucmer (nucleotide) or promer (six-frame translations) [default: %(default)s]', # choices=['nucmer', 'promer'], diff --git a/test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta b/test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta new file mode 100644 index 00000000..68b9d3ac --- /dev/null +++ b/test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta @@ -0,0 +1,9 @@ +>comp2_c0_seq1 len=421 path=[0:0-420] +GGGTGAGAGTGGTGGGGGTCGGTGGGAGACTCAGGGACTGTAGGGTGGGGGTCTGATGCT +GTCTGCTGCTCTAGTTGGAGGTGCTGTTGGAGTGGCTGATGGTCTCAGCTTTGTGGGGAG +AGGCATCTTGCAAATGGGGCACCTGTTGCTGACACTCAGAAGTAAGGTGAGGCAGTTGAG +ACACAGATAGTGGTTGTTGCACTCAACCAGGCCCTTGTTTTCGAACCAGCAGCTCTTGCA +GAACTGTGGCCCTAGATGTGTGGCATCCGGGATCAGGCTGGCTCTTGGACTGTCTCTTGA +TTCTGGGGCTTTGGATTGCTTGTTTCCCATTTGGAAGAGTTCAGCGTACAAAGCCTCAAA +AAGAAGAACAACCAAATTGCCTAGGATCCCCGGTGCGCGGGTCCATTTTCCTCATTTGTA +G diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index ef18300a..564b650b 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -420,7 +420,38 @@ def test_obscure_mummer3_bug(self): os.path.join(inDir, 'contig.mummer3_fail_lasv.fasta'), os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'), outFasta) - + + def test_not_all_segments_fail(self): + # IncompleteAssemblyError is thrown when only some but not all segments are recovered + inDir = util.file.get_test_input_path(self) + outFasta = util.file.mkstempfname('.fasta') + self.assertRaises(assembly.IncompleteAssemblyError, + assembly.order_and_orient, + os.path.join(inDir, 'contigs.lasv.one_small.fasta'), + os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'), + outFasta) + + def test_not_all_segments_succeed(self): + # ... unless we specifically allow for partial outputs + inDir = util.file.get_test_input_path(self) + outFasta = util.file.mkstempfname('.fasta') + assembly.order_and_orient( + os.path.join(inDir, 'contigs.lasv.one_small.fasta'), + os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'), + outFasta, + allow_incomplete_output=True) + + def test_empty_output_succeed(self): + # a completely empty output should be possible if we allow it + inDir = util.file.get_test_input_path(self) + emptyFasta = os.path.join(inDir, '..', 'empty.fasta') + outFasta = util.file.mkstempfname('.fasta') + assembly.order_and_orient( + emptyFasta, + os.path.join(inDir, 'ref.influenza.fasta'), + outFasta, + allow_incomplete_output=True) + self.assertEqualContents(outFasta, emptyFasta) class TestGap2Seq(TestCaseWithTmp): '''Test gap-filling tool Gap2Seq''' From 79a139f35340bc502a95cccb32ca4fe892818d55 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 09:35:04 -0500 Subject: [PATCH 3/6] add unit tests for empty fasta inputs --- test/unit/test_assembly.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index 564b650b..5a3258aa 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -467,6 +467,15 @@ def test_gapfill(self): shutil.copyfile(filled, '/tmp/filled.fasta') self.assertEqualContents(filled, join(inDir, 'TestGap2Seq', 'expected.ebov.doublehit.gapfill.fasta')) + def test_empty_fasta_input(self): + inDir = util.file.get_test_input_path() + empty_fasta = os.path.join(inDir, '..', 'empty.fasta') + with util.file.mkstempfname(suffix='.fasta') as out_fasta: + assembly.gapfill_gap2seq(in_scaffold=empty_fasta, + in_bam=os.path.join(inDir, 'G5012.3.testreads.bam'), + out_scaffold=out_fasta, random_seed=23923937, threads=1) + self.assertEqualContents(out_fasta, empty_fasta) + class TestImputeFromReference(TestCaseWithTmp): ''' Test the impute_from_reference command (align and modify_contig) ''' @@ -558,6 +567,21 @@ def test_small_mummer(self): str(Bio.SeqIO.read(outFasta, 'fasta').seq), str(Bio.SeqIO.read(expected, 'fasta').seq)) + def test_empty_fasta_input(self): + inDir = util.file.get_test_input_path() + empty_fasta = os.path.join(inDir, '..', 'empty.fasta') + outFasta = util.file.mkstempfname('.fasta') + assembly.impute_from_reference( + empty_fasta, + os.path.join(inDir, 'ref.sub.ebov.fasta'), + outFasta, + minLengthFraction=0.0, + minUnambig=0.0, + replaceLength=5, + newName='test_sub-EBOV.genome', + aligner='mummer') + self.assertEqualContents(outFasta, empty_fasta) + class TestMutableSequence(unittest.TestCase): ''' Test the MutableSequence class ''' From c8002010e63ec5beea295ba8a24cfb296227c4a6 Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 09:46:41 -0500 Subject: [PATCH 4/6] some unit test bugfixing --- test/unit/test_assembly.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index 5a3258aa..04873d08 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -470,11 +470,11 @@ def test_gapfill(self): def test_empty_fasta_input(self): inDir = util.file.get_test_input_path() empty_fasta = os.path.join(inDir, '..', 'empty.fasta') - with util.file.mkstempfname(suffix='.fasta') as out_fasta: - assembly.gapfill_gap2seq(in_scaffold=empty_fasta, - in_bam=os.path.join(inDir, 'G5012.3.testreads.bam'), - out_scaffold=out_fasta, random_seed=23923937, threads=1) - self.assertEqualContents(out_fasta, empty_fasta) + out_fasta = util.file.mkstempfname('.fasta') + assembly.gapfill_gap2seq(in_scaffold=empty_fasta, + in_bam=os.path.join(inDir, 'G5012.3.testreads.bam'), + out_scaffold=out_fasta, random_seed=23923937, threads=1) + self.assertEqualContents(out_fasta, empty_fasta) class TestImputeFromReference(TestCaseWithTmp): @@ -568,7 +568,7 @@ def test_small_mummer(self): str(Bio.SeqIO.read(expected, 'fasta').seq)) def test_empty_fasta_input(self): - inDir = util.file.get_test_input_path() + inDir = util.file.get_test_input_path(self) empty_fasta = os.path.join(inDir, '..', 'empty.fasta') outFasta = util.file.mkstempfname('.fasta') assembly.impute_from_reference( From 096050f148b8f37cbafceda11a36f2dc805ffdcd Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 10:02:45 -0500 Subject: [PATCH 5/6] fix unit test directory --- test/unit/test_assembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py index 04873d08..6f8affd1 100644 --- a/test/unit/test_assembly.py +++ b/test/unit/test_assembly.py @@ -469,7 +469,7 @@ def test_gapfill(self): def test_empty_fasta_input(self): inDir = util.file.get_test_input_path() - empty_fasta = os.path.join(inDir, '..', 'empty.fasta') + empty_fasta = os.path.join(inDir, 'empty.fasta') out_fasta = util.file.mkstempfname('.fasta') assembly.gapfill_gap2seq(in_scaffold=empty_fasta, in_bam=os.path.join(inDir, 'G5012.3.testreads.bam'), From 5400c731f1c67dc2f5c3a3a48f623cd1d67c7eec Mon Sep 17 00:00:00 2001 From: Daniel Park Date: Tue, 9 Jan 2024 10:10:15 -0500 Subject: [PATCH 6/6] make special exception on empty input to pass empty output --- assembly.py | 141 ++++++++++++++++++++++++++-------------------------- 1 file changed, 71 insertions(+), 70 deletions(-) diff --git a/assembly.py b/assembly.py index 772a65b5..1f0a371e 100755 --- a/assembly.py +++ b/assembly.py @@ -663,79 +663,80 @@ def impute_from_reference( with open(inFasta, 'r') as asmFastaFile: with open(inReference, 'r') as refFastaFile: - asmFasta = Bio.SeqIO.parse(asmFastaFile, 'fasta') + asmFasta = list(Bio.SeqIO.parse(asmFastaFile, 'fasta')) refFasta = Bio.SeqIO.parse(refFastaFile, 'fasta') - for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)): - # our zip fails if one file has more seqs than the other - if not refSeqObj or not asmSeqObj: - raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.") - - # error if PoorAssembly - minLength = len(refSeqObj) * minLengthFraction - non_n_count = util.misc.unambig_count(asmSeqObj.seq) - seq_len = len(asmSeqObj) - log.info( - "Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format( - idx=idx + 1, - segname=refSeqObj.id, - len_actual=seq_len, - len_desired=len(refSeqObj), - min_frac=minLengthFraction, - unamb_actual=non_n_count, - unamb_desired=seq_len, - min_unamb=minUnambig - ) - ) - if seq_len < minLength or non_n_count < seq_len * minUnambig: - raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj)) - - # prepare temp input and output files - tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta") - concat_file = util.file.mkstempfname('.ref_and_actual.fasta') - ref_file = util.file.mkstempfname('.ref.fasta') - actual_file = util.file.mkstempfname('.actual.fasta') - aligned_file = util.file.mkstempfname('.'+aligner+'.fasta') - refName = refSeqObj.id - with open(concat_file, 'wt') as outf: - Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta") - with open(ref_file, 'wt') as outf: - Bio.SeqIO.write([refSeqObj], outf, "fasta") - with open(actual_file, 'wt') as outf: - Bio.SeqIO.write([asmSeqObj], outf, "fasta") - - # align scaffolded genome to reference (choose one of three aligners) - if aligner == 'mafft': - assemble.mafft.MafftTool().execute( - [ref_file, actual_file], aligned_file, False, True, True, False, False, None + if asmFasta: + for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)): + # our zip fails if one file has more seqs than the other + if not refSeqObj or not asmSeqObj: + raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.") + + # error if PoorAssembly + minLength = len(refSeqObj) * minLengthFraction + non_n_count = util.misc.unambig_count(asmSeqObj.seq) + seq_len = len(asmSeqObj) + log.info( + "Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format( + idx=idx + 1, + segname=refSeqObj.id, + len_actual=seq_len, + len_desired=len(refSeqObj), + min_frac=minLengthFraction, + unamb_actual=non_n_count, + unamb_desired=seq_len, + min_unamb=minUnambig + ) ) - elif aligner == 'muscle': - if len(refSeqObj) > 40000: - assemble.muscle.MuscleTool().execute( - concat_file, aligned_file, quiet=False, - maxiters=2, diags=True + if seq_len < minLength or non_n_count < seq_len * minUnambig: + raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj)) + + # prepare temp input and output files + tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta") + concat_file = util.file.mkstempfname('.ref_and_actual.fasta') + ref_file = util.file.mkstempfname('.ref.fasta') + actual_file = util.file.mkstempfname('.actual.fasta') + aligned_file = util.file.mkstempfname('.'+aligner+'.fasta') + refName = refSeqObj.id + with open(concat_file, 'wt') as outf: + Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta") + with open(ref_file, 'wt') as outf: + Bio.SeqIO.write([refSeqObj], outf, "fasta") + with open(actual_file, 'wt') as outf: + Bio.SeqIO.write([asmSeqObj], outf, "fasta") + + # align scaffolded genome to reference (choose one of three aligners) + if aligner == 'mafft': + assemble.mafft.MafftTool().execute( + [ref_file, actual_file], aligned_file, False, True, True, False, False, None ) - else: - assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False) - elif aligner == 'mummer': - assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file) - - # run modify_contig - args = [ - aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends', - '--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps' - ] - if newName: - # renames the segment name "sampleName-idx" where idx is the segment number - args.extend(['--name', newName + "-" + str(idx + 1)]) - args = pmc.parse_args(args) - args.func_main(args) - - # clean up - os.unlink(concat_file) - os.unlink(ref_file) - os.unlink(actual_file) - os.unlink(aligned_file) - tempFastas.append(tmpOutputFile) + elif aligner == 'muscle': + if len(refSeqObj) > 40000: + assemble.muscle.MuscleTool().execute( + concat_file, aligned_file, quiet=False, + maxiters=2, diags=True + ) + else: + assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False) + elif aligner == 'mummer': + assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file) + + # run modify_contig + args = [ + aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends', + '--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps' + ] + if newName: + # renames the segment name "sampleName-idx" where idx is the segment number + args.extend(['--name', newName + "-" + str(idx + 1)]) + args = pmc.parse_args(args) + args.func_main(args) + + # clean up + os.unlink(concat_file) + os.unlink(ref_file) + os.unlink(actual_file) + os.unlink(aligned_file) + tempFastas.append(tmpOutputFile) # merge outputs util.file.concat(tempFastas, outFasta)