Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add --allow_incomplete_output option to order_and_orient #36

Merged
merged 6 commits into from
Jan 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM quay.io/broadinstitute/viral-core:2.2.3
FROM quay.io/broadinstitute/viral-core:2.2.4

LABEL maintainer "[email protected]"

Expand Down
152 changes: 80 additions & 72 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def order_and_orient(inFasta, inReference, outFasta,
breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None,
maxgap=200, minmatch=10, mincluster=None,
min_pct_id=0.6, min_contig_len=200, min_pct_contig_aligned=0.3, n_genome_segments=0,
outStats=None, threads=None):
allow_incomplete_output=False, outStats=None, threads=None):
''' This step cleans up the de novo assembly with a known reference genome.
Uses MUMmer (nucmer or promer) to create a reference-based consensus
sequence of aligned contigs (with runs of N's in between the de novo
Expand Down Expand Up @@ -481,7 +481,7 @@ def order_and_orient(inFasta, inReference, outFasta,
base_counts = [sum([len(seg.seq.ungap('N')) for seg in scaffold]) \
if len(scaffold)==n_genome_segments else 0 for scaffold in scaffolds]
best_ref_num = numpy.argmax(base_counts)
if len(scaffolds[best_ref_num]) != n_genome_segments:
if (len(scaffolds[best_ref_num]) != n_genome_segments) and not allow_incomplete_output:
raise IncompleteAssemblyError(len(scaffolds[best_ref_num]), n_genome_segments)
log.info('base_counts={} best_ref_num={}'.format(base_counts, best_ref_num))
shutil.copyfile(scaffolds_fasta[best_ref_num], outFasta)
Expand Down Expand Up @@ -525,6 +525,13 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()):

parser.add_argument('--outReference', help='Output the reference chosen for scaffolding to this file')
parser.add_argument('--outStats', help='Output stats used in reference selection')
parser.add_argument(
"--allow_incomplete_output",
help="Do not fail with IncompleteAssemblyError if we fail to recover all segments of the desired genome.",
default=False,
action="store_true",
dest="allow_incomplete_output"
)
#parser.add_argument('--aligner',
# help='nucmer (nucleotide) or promer (six-frame translations) [default: %(default)s]',
# choices=['nucmer', 'promer'],
Expand Down Expand Up @@ -656,79 +663,80 @@ def impute_from_reference(

with open(inFasta, 'r') as asmFastaFile:
with open(inReference, 'r') as refFastaFile:
asmFasta = Bio.SeqIO.parse(asmFastaFile, 'fasta')
asmFasta = list(Bio.SeqIO.parse(asmFastaFile, 'fasta'))
refFasta = Bio.SeqIO.parse(refFastaFile, 'fasta')
for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)):
# our zip fails if one file has more seqs than the other
if not refSeqObj or not asmSeqObj:
raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.")

# error if PoorAssembly
minLength = len(refSeqObj) * minLengthFraction
non_n_count = util.misc.unambig_count(asmSeqObj.seq)
seq_len = len(asmSeqObj)
log.info(
"Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format(
idx=idx + 1,
segname=refSeqObj.id,
len_actual=seq_len,
len_desired=len(refSeqObj),
min_frac=minLengthFraction,
unamb_actual=non_n_count,
unamb_desired=seq_len,
min_unamb=minUnambig
)
)
if seq_len < minLength or non_n_count < seq_len * minUnambig:
raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj))

# prepare temp input and output files
tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta")
concat_file = util.file.mkstempfname('.ref_and_actual.fasta')
ref_file = util.file.mkstempfname('.ref.fasta')
actual_file = util.file.mkstempfname('.actual.fasta')
aligned_file = util.file.mkstempfname('.'+aligner+'.fasta')
refName = refSeqObj.id
with open(concat_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta")
with open(ref_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj], outf, "fasta")
with open(actual_file, 'wt') as outf:
Bio.SeqIO.write([asmSeqObj], outf, "fasta")

# align scaffolded genome to reference (choose one of three aligners)
if aligner == 'mafft':
assemble.mafft.MafftTool().execute(
[ref_file, actual_file], aligned_file, False, True, True, False, False, None
if asmFasta:
for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)):
# our zip fails if one file has more seqs than the other
if not refSeqObj or not asmSeqObj:
raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.")

# error if PoorAssembly
minLength = len(refSeqObj) * minLengthFraction
non_n_count = util.misc.unambig_count(asmSeqObj.seq)
seq_len = len(asmSeqObj)
log.info(
"Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format(
idx=idx + 1,
segname=refSeqObj.id,
len_actual=seq_len,
len_desired=len(refSeqObj),
min_frac=minLengthFraction,
unamb_actual=non_n_count,
unamb_desired=seq_len,
min_unamb=minUnambig
)
)
elif aligner == 'muscle':
if len(refSeqObj) > 40000:
assemble.muscle.MuscleTool().execute(
concat_file, aligned_file, quiet=False,
maxiters=2, diags=True
if seq_len < minLength or non_n_count < seq_len * minUnambig:
raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj))

# prepare temp input and output files
tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta")
concat_file = util.file.mkstempfname('.ref_and_actual.fasta')
ref_file = util.file.mkstempfname('.ref.fasta')
actual_file = util.file.mkstempfname('.actual.fasta')
aligned_file = util.file.mkstempfname('.'+aligner+'.fasta')
refName = refSeqObj.id
with open(concat_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta")
with open(ref_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj], outf, "fasta")
with open(actual_file, 'wt') as outf:
Bio.SeqIO.write([asmSeqObj], outf, "fasta")

# align scaffolded genome to reference (choose one of three aligners)
if aligner == 'mafft':
assemble.mafft.MafftTool().execute(
[ref_file, actual_file], aligned_file, False, True, True, False, False, None
)
else:
assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False)
elif aligner == 'mummer':
assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file)

# run modify_contig
args = [
aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends',
'--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps'
]
if newName:
# renames the segment name "sampleName-idx" where idx is the segment number
args.extend(['--name', newName + "-" + str(idx + 1)])
args = pmc.parse_args(args)
args.func_main(args)

# clean up
os.unlink(concat_file)
os.unlink(ref_file)
os.unlink(actual_file)
os.unlink(aligned_file)
tempFastas.append(tmpOutputFile)
elif aligner == 'muscle':
if len(refSeqObj) > 40000:
assemble.muscle.MuscleTool().execute(
concat_file, aligned_file, quiet=False,
maxiters=2, diags=True
)
else:
assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False)
elif aligner == 'mummer':
assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file)

# run modify_contig
args = [
aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends',
'--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps'
]
if newName:
# renames the segment name "sampleName-idx" where idx is the segment number
args.extend(['--name', newName + "-" + str(idx + 1)])
args = pmc.parse_args(args)
args.func_main(args)

# clean up
os.unlink(concat_file)
os.unlink(ref_file)
os.unlink(actual_file)
os.unlink(aligned_file)
tempFastas.append(tmpOutputFile)

# merge outputs
util.file.concat(tempFastas, outFasta)
Expand Down
9 changes: 9 additions & 0 deletions test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>comp2_c0_seq1 len=421 path=[0:0-420]
GGGTGAGAGTGGTGGGGGTCGGTGGGAGACTCAGGGACTGTAGGGTGGGGGTCTGATGCT
GTCTGCTGCTCTAGTTGGAGGTGCTGTTGGAGTGGCTGATGGTCTCAGCTTTGTGGGGAG
AGGCATCTTGCAAATGGGGCACCTGTTGCTGACACTCAGAAGTAAGGTGAGGCAGTTGAG
ACACAGATAGTGGTTGTTGCACTCAACCAGGCCCTTGTTTTCGAACCAGCAGCTCTTGCA
GAACTGTGGCCCTAGATGTGTGGCATCCGGGATCAGGCTGGCTCTTGGACTGTCTCTTGA
TTCTGGGGCTTTGGATTGCTTGTTTCCCATTTGGAAGAGTTCAGCGTACAAAGCCTCAAA
AAGAAGAACAACCAAATTGCCTAGGATCCCCGGTGCGCGGGTCCATTTTCCTCATTTGTA
G
57 changes: 56 additions & 1 deletion test/unit/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,38 @@ def test_obscure_mummer3_bug(self):
os.path.join(inDir, 'contig.mummer3_fail_lasv.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta)


def test_not_all_segments_fail(self):
# IncompleteAssemblyError is thrown when only some but not all segments are recovered
inDir = util.file.get_test_input_path(self)
outFasta = util.file.mkstempfname('.fasta')
self.assertRaises(assembly.IncompleteAssemblyError,
assembly.order_and_orient,
os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta)

def test_not_all_segments_succeed(self):
# ... unless we specifically allow for partial outputs
inDir = util.file.get_test_input_path(self)
outFasta = util.file.mkstempfname('.fasta')
assembly.order_and_orient(
os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta,
allow_incomplete_output=True)

def test_empty_output_succeed(self):
# a completely empty output should be possible if we allow it
inDir = util.file.get_test_input_path(self)
emptyFasta = os.path.join(inDir, '..', 'empty.fasta')
outFasta = util.file.mkstempfname('.fasta')
assembly.order_and_orient(
emptyFasta,
os.path.join(inDir, 'ref.influenza.fasta'),
outFasta,
allow_incomplete_output=True)
self.assertEqualContents(outFasta, emptyFasta)

class TestGap2Seq(TestCaseWithTmp):
'''Test gap-filling tool Gap2Seq'''
Expand All @@ -436,6 +467,15 @@ def test_gapfill(self):
shutil.copyfile(filled, '/tmp/filled.fasta')
self.assertEqualContents(filled, join(inDir, 'TestGap2Seq', 'expected.ebov.doublehit.gapfill.fasta'))

def test_empty_fasta_input(self):
inDir = util.file.get_test_input_path()
empty_fasta = os.path.join(inDir, 'empty.fasta')
out_fasta = util.file.mkstempfname('.fasta')
assembly.gapfill_gap2seq(in_scaffold=empty_fasta,
in_bam=os.path.join(inDir, 'G5012.3.testreads.bam'),
out_scaffold=out_fasta, random_seed=23923937, threads=1)
self.assertEqualContents(out_fasta, empty_fasta)


class TestImputeFromReference(TestCaseWithTmp):
''' Test the impute_from_reference command (align and modify_contig) '''
Expand Down Expand Up @@ -527,6 +567,21 @@ def test_small_mummer(self):
str(Bio.SeqIO.read(outFasta, 'fasta').seq),
str(Bio.SeqIO.read(expected, 'fasta').seq))

def test_empty_fasta_input(self):
inDir = util.file.get_test_input_path(self)
empty_fasta = os.path.join(inDir, '..', 'empty.fasta')
outFasta = util.file.mkstempfname('.fasta')
assembly.impute_from_reference(
empty_fasta,
os.path.join(inDir, 'ref.sub.ebov.fasta'),
outFasta,
minLengthFraction=0.0,
minUnambig=0.0,
replaceLength=5,
newName='test_sub-EBOV.genome',
aligner='mummer')
self.assertEqualContents(outFasta, empty_fasta)


class TestMutableSequence(unittest.TestCase):
''' Test the MutableSequence class '''
Expand Down