Skip to content

Commit

Permalink
add new option --allow_incomplete_output to order_and_orient
Browse files Browse the repository at this point in the history
  • Loading branch information
dpark01 committed Jan 9, 2024
1 parent 750e48c commit ea94cc1
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 3 deletions.
11 changes: 9 additions & 2 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def order_and_orient(inFasta, inReference, outFasta,
breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None,
maxgap=200, minmatch=10, mincluster=None,
min_pct_id=0.6, min_contig_len=200, min_pct_contig_aligned=0.3, n_genome_segments=0,
outStats=None, threads=None):
allow_incomplete_output=False, outStats=None, threads=None):
''' This step cleans up the de novo assembly with a known reference genome.
Uses MUMmer (nucmer or promer) to create a reference-based consensus
sequence of aligned contigs (with runs of N's in between the de novo
Expand Down Expand Up @@ -481,7 +481,7 @@ def order_and_orient(inFasta, inReference, outFasta,
base_counts = [sum([len(seg.seq.ungap('N')) for seg in scaffold]) \
if len(scaffold)==n_genome_segments else 0 for scaffold in scaffolds]
best_ref_num = numpy.argmax(base_counts)
if len(scaffolds[best_ref_num]) != n_genome_segments:
if (len(scaffolds[best_ref_num]) != n_genome_segments) and not allow_incomplete_output:
raise IncompleteAssemblyError(len(scaffolds[best_ref_num]), n_genome_segments)
log.info('base_counts={} best_ref_num={}'.format(base_counts, best_ref_num))
shutil.copyfile(scaffolds_fasta[best_ref_num], outFasta)
Expand Down Expand Up @@ -525,6 +525,13 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()):

parser.add_argument('--outReference', help='Output the reference chosen for scaffolding to this file')
parser.add_argument('--outStats', help='Output stats used in reference selection')
parser.add_argument(
"--allow_incomplete_output",
help="Do not fail with IncompleteAssemblyError if we fail to recover all segments of the desired genome.",
default=False,
action="store_true",
dest="allow_incomplete_output"
)
#parser.add_argument('--aligner',
# help='nucmer (nucleotide) or promer (six-frame translations) [default: %(default)s]',
# choices=['nucmer', 'promer'],
Expand Down
9 changes: 9 additions & 0 deletions test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>comp2_c0_seq1 len=421 path=[0:0-420]
GGGTGAGAGTGGTGGGGGTCGGTGGGAGACTCAGGGACTGTAGGGTGGGGGTCTGATGCT
GTCTGCTGCTCTAGTTGGAGGTGCTGTTGGAGTGGCTGATGGTCTCAGCTTTGTGGGGAG
AGGCATCTTGCAAATGGGGCACCTGTTGCTGACACTCAGAAGTAAGGTGAGGCAGTTGAG
ACACAGATAGTGGTTGTTGCACTCAACCAGGCCCTTGTTTTCGAACCAGCAGCTCTTGCA
GAACTGTGGCCCTAGATGTGTGGCATCCGGGATCAGGCTGGCTCTTGGACTGTCTCTTGA
TTCTGGGGCTTTGGATTGCTTGTTTCCCATTTGGAAGAGTTCAGCGTACAAAGCCTCAAA
AAGAAGAACAACCAAATTGCCTAGGATCCCCGGTGCGCGGGTCCATTTTCCTCATTTGTA
G
33 changes: 32 additions & 1 deletion test/unit/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,38 @@ def test_obscure_mummer3_bug(self):
os.path.join(inDir, 'contig.mummer3_fail_lasv.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta)


def test_not_all_segments_fail(self):
# IncompleteAssemblyError is thrown when only some but not all segments are recovered
inDir = util.file.get_test_input_path(self)
outFasta = util.file.mkstempfname('.fasta')
self.assertRaises(assembly.IncompleteAssemblyError,
assembly.order_and_orient,
os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta)

def test_not_all_segments_succeed(self):
# ... unless we specifically allow for partial outputs
inDir = util.file.get_test_input_path(self)
outFasta = util.file.mkstempfname('.fasta')
assembly.order_and_orient(
os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta,
allow_incomplete_output=True)

def test_empty_output_succeed(self):
# a completely empty output should be possible if we allow it
inDir = util.file.get_test_input_path(self)
emptyFasta = os.path.join(inDir, '..', 'empty.fasta')
outFasta = util.file.mkstempfname('.fasta')
assembly.order_and_orient(
emptyFasta,
os.path.join(inDir, 'ref.influenza.fasta'),
outFasta,
allow_incomplete_output=True)
self.assertEqualContents(outFasta, emptyFasta)

class TestGap2Seq(TestCaseWithTmp):
'''Test gap-filling tool Gap2Seq'''
Expand Down

0 comments on commit ea94cc1

Please sign in to comment.