Merge pull request #36 from broadinstitute/dp-scaffold

add --allow_incomplete_output option to order_and_orient
broadinstitute · Jan 9, 2024 · 5a15d84 · 5a15d84
2 parents a5a334b + 5400c73
commit 5a15d84
Show file tree

Hide file tree

Showing 4 changed files with 146 additions and 74 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/broadinstitute/viral-core:2.2.3
+FROM quay.io/broadinstitute/viral-core:2.2.4
 
 LABEL maintainer "[email protected]"
 

diff --git a/assembly.py b/assembly.py
@@ -428,7 +428,7 @@ def order_and_orient(inFasta, inReference, outFasta,
         breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None,
         maxgap=200, minmatch=10, mincluster=None,
         min_pct_id=0.6, min_contig_len=200, min_pct_contig_aligned=0.3, n_genome_segments=0, 
-        outStats=None, threads=None):
+        allow_incomplete_output=False, outStats=None, threads=None):
     ''' This step cleans up the de novo assembly with a known reference genome.
         Uses MUMmer (nucmer or promer) to create a reference-based consensus
         sequence of aligned contigs (with runs of N's in between the de novo
@@ -481,7 +481,7 @@ def order_and_orient(inFasta, inReference, outFasta,
         base_counts = [sum([len(seg.seq.ungap('N')) for seg in scaffold]) \
             if len(scaffold)==n_genome_segments else 0 for scaffold in scaffolds]
         best_ref_num = numpy.argmax(base_counts)
-        if len(scaffolds[best_ref_num]) != n_genome_segments:
+        if (len(scaffolds[best_ref_num]) != n_genome_segments) and not allow_incomplete_output:
             raise IncompleteAssemblyError(len(scaffolds[best_ref_num]), n_genome_segments)
         log.info('base_counts={} best_ref_num={}'.format(base_counts, best_ref_num))
         shutil.copyfile(scaffolds_fasta[best_ref_num], outFasta)
@@ -525,6 +525,13 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()):
 
     parser.add_argument('--outReference', help='Output the reference chosen for scaffolding to this file')
     parser.add_argument('--outStats', help='Output stats used in reference selection')
+    parser.add_argument(
+        "--allow_incomplete_output",
+        help="Do not fail with IncompleteAssemblyError if we fail to recover all segments of the desired genome.",
+        default=False,
+        action="store_true",
+        dest="allow_incomplete_output"
+    )
     #parser.add_argument('--aligner',
     #                    help='nucmer (nucleotide) or promer (six-frame translations) [default: %(default)s]',
     #                    choices=['nucmer', 'promer'],
@@ -656,79 +663,80 @@ def impute_from_reference(
 
     with open(inFasta, 'r') as asmFastaFile:
         with open(inReference, 'r') as refFastaFile:
-            asmFasta = Bio.SeqIO.parse(asmFastaFile, 'fasta')
+            asmFasta = list(Bio.SeqIO.parse(asmFastaFile, 'fasta'))
             refFasta = Bio.SeqIO.parse(refFastaFile, 'fasta')
-            for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)):
-                # our zip fails if one file has more seqs than the other
-                if not refSeqObj or not asmSeqObj:
-                    raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.")
-
-                # error if PoorAssembly
-                minLength = len(refSeqObj) * minLengthFraction
-                non_n_count = util.misc.unambig_count(asmSeqObj.seq)
-                seq_len = len(asmSeqObj)
-                log.info(
-                    "Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format(
-                        idx=idx + 1,
-                        segname=refSeqObj.id,
-                        len_actual=seq_len,
-                        len_desired=len(refSeqObj),
-                        min_frac=minLengthFraction,
-                        unamb_actual=non_n_count,
-                        unamb_desired=seq_len,
-                        min_unamb=minUnambig
-                    )
-                )
-                if seq_len < minLength or non_n_count < seq_len * minUnambig:
-                    raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj))
-
-                # prepare temp input and output files
-                tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta")
-                concat_file = util.file.mkstempfname('.ref_and_actual.fasta')
-                ref_file = util.file.mkstempfname('.ref.fasta')
-                actual_file = util.file.mkstempfname('.actual.fasta')
-                aligned_file = util.file.mkstempfname('.'+aligner+'.fasta')
-                refName = refSeqObj.id
-                with open(concat_file, 'wt') as outf:
-                    Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta")
-                with open(ref_file, 'wt') as outf:
-                    Bio.SeqIO.write([refSeqObj], outf, "fasta")
-                with open(actual_file, 'wt') as outf:
-                    Bio.SeqIO.write([asmSeqObj], outf, "fasta")
-
-                # align scaffolded genome to reference (choose one of three aligners)
-                if aligner == 'mafft':
-                    assemble.mafft.MafftTool().execute(
-                        [ref_file, actual_file], aligned_file, False, True, True, False, False, None
+            if asmFasta:
+                for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)):
+                    # our zip fails if one file has more seqs than the other
+                    if not refSeqObj or not asmSeqObj:
+                        raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.")
+
+                    # error if PoorAssembly
+                    minLength = len(refSeqObj) * minLengthFraction
+                    non_n_count = util.misc.unambig_count(asmSeqObj.seq)
+                    seq_len = len(asmSeqObj)
+                    log.info(
+                        "Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format(
+                            idx=idx + 1,
+                            segname=refSeqObj.id,
+                            len_actual=seq_len,
+                            len_desired=len(refSeqObj),
+                            min_frac=minLengthFraction,
+                            unamb_actual=non_n_count,
+                            unamb_desired=seq_len,
+                            min_unamb=minUnambig
+                        )
                     )
-                elif aligner == 'muscle':
-                    if len(refSeqObj) > 40000:
-                        assemble.muscle.MuscleTool().execute(
-                            concat_file, aligned_file, quiet=False,
-                            maxiters=2, diags=True
+                    if seq_len < minLength or non_n_count < seq_len * minUnambig:
+                        raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj))
+
+                    # prepare temp input and output files
+                    tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta")
+                    concat_file = util.file.mkstempfname('.ref_and_actual.fasta')
+                    ref_file = util.file.mkstempfname('.ref.fasta')
+                    actual_file = util.file.mkstempfname('.actual.fasta')
+                    aligned_file = util.file.mkstempfname('.'+aligner+'.fasta')
+                    refName = refSeqObj.id
+                    with open(concat_file, 'wt') as outf:
+                        Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta")
+                    with open(ref_file, 'wt') as outf:
+                        Bio.SeqIO.write([refSeqObj], outf, "fasta")
+                    with open(actual_file, 'wt') as outf:
+                        Bio.SeqIO.write([asmSeqObj], outf, "fasta")
+
+                    # align scaffolded genome to reference (choose one of three aligners)
+                    if aligner == 'mafft':
+                        assemble.mafft.MafftTool().execute(
+                            [ref_file, actual_file], aligned_file, False, True, True, False, False, None
                         )
-                    else:
-                        assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False)
-                elif aligner == 'mummer':
-                    assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file)
-
-                # run modify_contig
-                args = [
-                    aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends',
-                    '--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps'
-                ]
-                if newName:
-                    # renames the segment name "sampleName-idx" where idx is the segment number
-                    args.extend(['--name', newName + "-" + str(idx + 1)])
-                args = pmc.parse_args(args)
-                args.func_main(args)
-
-                # clean up
-                os.unlink(concat_file)
-                os.unlink(ref_file)
-                os.unlink(actual_file)
-                os.unlink(aligned_file)
-                tempFastas.append(tmpOutputFile)
+                    elif aligner == 'muscle':
+                        if len(refSeqObj) > 40000:
+                            assemble.muscle.MuscleTool().execute(
+                                concat_file, aligned_file, quiet=False,
+                                maxiters=2, diags=True
+                            )
+                        else:
+                            assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False)
+                    elif aligner == 'mummer':
+                        assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file)
+
+                    # run modify_contig
+                    args = [
+                        aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends',
+                        '--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps'
+                    ]
+                    if newName:
+                        # renames the segment name "sampleName-idx" where idx is the segment number
+                        args.extend(['--name', newName + "-" + str(idx + 1)])
+                    args = pmc.parse_args(args)
+                    args.func_main(args)
+
+                    # clean up
+                    os.unlink(concat_file)
+                    os.unlink(ref_file)
+                    os.unlink(actual_file)
+                    os.unlink(aligned_file)
+                    tempFastas.append(tmpOutputFile)
 
     # merge outputs
     util.file.concat(tempFastas, outFasta)

diff --git a/test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta b/test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta
@@ -0,0 +1,9 @@
+>comp2_c0_seq1 len=421 path=[0:0-420]
+GGGTGAGAGTGGTGGGGGTCGGTGGGAGACTCAGGGACTGTAGGGTGGGGGTCTGATGCT
+GTCTGCTGCTCTAGTTGGAGGTGCTGTTGGAGTGGCTGATGGTCTCAGCTTTGTGGGGAG
+AGGCATCTTGCAAATGGGGCACCTGTTGCTGACACTCAGAAGTAAGGTGAGGCAGTTGAG
+ACACAGATAGTGGTTGTTGCACTCAACCAGGCCCTTGTTTTCGAACCAGCAGCTCTTGCA
+GAACTGTGGCCCTAGATGTGTGGCATCCGGGATCAGGCTGGCTCTTGGACTGTCTCTTGA
+TTCTGGGGCTTTGGATTGCTTGTTTCCCATTTGGAAGAGTTCAGCGTACAAAGCCTCAAA
+AAGAAGAACAACCAAATTGCCTAGGATCCCCGGTGCGCGGGTCCATTTTCCTCATTTGTA
+G
diff --git a/test/unit/test_assembly.py b/test/unit/test_assembly.py
@@ -420,7 +420,38 @@ def test_obscure_mummer3_bug(self):
             os.path.join(inDir, 'contig.mummer3_fail_lasv.fasta'),
             os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
             outFasta)
-
+
+    def test_not_all_segments_fail(self):
+        # IncompleteAssemblyError is thrown when only some but not all segments are recovered
+        inDir = util.file.get_test_input_path(self)
+        outFasta = util.file.mkstempfname('.fasta')
+        self.assertRaises(assembly.IncompleteAssemblyError,
+            assembly.order_and_orient,
+            os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
+            os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
+            outFasta)
+
+    def test_not_all_segments_succeed(self):
+        # ... unless we specifically allow for partial outputs
+        inDir = util.file.get_test_input_path(self)
+        outFasta = util.file.mkstempfname('.fasta')
+        assembly.order_and_orient(
+            os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
+            os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
+            outFasta,
+            allow_incomplete_output=True)
+
+    def test_empty_output_succeed(self):
+        # a completely empty output should be possible if we allow it
+        inDir = util.file.get_test_input_path(self)
+        emptyFasta = os.path.join(inDir, '..', 'empty.fasta')
+        outFasta = util.file.mkstempfname('.fasta')
+        assembly.order_and_orient(
+            emptyFasta,
+            os.path.join(inDir, 'ref.influenza.fasta'),
+            outFasta,
+            allow_incomplete_output=True)
+        self.assertEqualContents(outFasta, emptyFasta)
 
 class TestGap2Seq(TestCaseWithTmp):
     '''Test gap-filling tool Gap2Seq'''
@@ -436,6 +467,15 @@ def test_gapfill(self):
             shutil.copyfile(filled, '/tmp/filled.fasta')
             self.assertEqualContents(filled, join(inDir, 'TestGap2Seq', 'expected.ebov.doublehit.gapfill.fasta'))
 
+    def test_empty_fasta_input(self):
+        inDir = util.file.get_test_input_path()
+        empty_fasta = os.path.join(inDir, 'empty.fasta')
+        out_fasta = util.file.mkstempfname('.fasta')
+        assembly.gapfill_gap2seq(in_scaffold=empty_fasta,
+                                    in_bam=os.path.join(inDir, 'G5012.3.testreads.bam'),
+                                    out_scaffold=out_fasta, random_seed=23923937, threads=1)
+        self.assertEqualContents(out_fasta, empty_fasta)
+
 
 class TestImputeFromReference(TestCaseWithTmp):
     ''' Test the impute_from_reference command (align and modify_contig) '''
@@ -527,6 +567,21 @@ def test_small_mummer(self):
             str(Bio.SeqIO.read(outFasta, 'fasta').seq),
             str(Bio.SeqIO.read(expected, 'fasta').seq))
 
+    def test_empty_fasta_input(self):
+        inDir = util.file.get_test_input_path(self)
+        empty_fasta = os.path.join(inDir, '..', 'empty.fasta')
+        outFasta = util.file.mkstempfname('.fasta')
+        assembly.impute_from_reference(
+            empty_fasta,
+            os.path.join(inDir, 'ref.sub.ebov.fasta'),
+            outFasta,
+            minLengthFraction=0.0,
+            minUnambig=0.0,
+            replaceLength=5,
+            newName='test_sub-EBOV.genome',
+            aligner='mummer')
+        self.assertEqualContents(outFasta, empty_fasta)
+
 
 class TestMutableSequence(unittest.TestCase):
     ''' Test the MutableSequence class '''