Skip to content

Commit

Permalink
Merge pull request #36 from broadinstitute/dp-scaffold
Browse files Browse the repository at this point in the history
add --allow_incomplete_output option to order_and_orient
  • Loading branch information
dpark01 authored Jan 9, 2024
2 parents a5a334b + 5400c73 commit 5a15d84
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 74 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM quay.io/broadinstitute/viral-core:2.2.3
FROM quay.io/broadinstitute/viral-core:2.2.4

LABEL maintainer "[email protected]"

Expand Down
152 changes: 80 additions & 72 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,7 @@ def order_and_orient(inFasta, inReference, outFasta,
breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None,
maxgap=200, minmatch=10, mincluster=None,
min_pct_id=0.6, min_contig_len=200, min_pct_contig_aligned=0.3, n_genome_segments=0,
outStats=None, threads=None):
allow_incomplete_output=False, outStats=None, threads=None):
''' This step cleans up the de novo assembly with a known reference genome.
Uses MUMmer (nucmer or promer) to create a reference-based consensus
sequence of aligned contigs (with runs of N's in between the de novo
Expand Down Expand Up @@ -481,7 +481,7 @@ def order_and_orient(inFasta, inReference, outFasta,
base_counts = [sum([len(seg.seq.ungap('N')) for seg in scaffold]) \
if len(scaffold)==n_genome_segments else 0 for scaffold in scaffolds]
best_ref_num = numpy.argmax(base_counts)
if len(scaffolds[best_ref_num]) != n_genome_segments:
if (len(scaffolds[best_ref_num]) != n_genome_segments) and not allow_incomplete_output:
raise IncompleteAssemblyError(len(scaffolds[best_ref_num]), n_genome_segments)
log.info('base_counts={} best_ref_num={}'.format(base_counts, best_ref_num))
shutil.copyfile(scaffolds_fasta[best_ref_num], outFasta)
Expand Down Expand Up @@ -525,6 +525,13 @@ def parser_order_and_orient(parser=argparse.ArgumentParser()):

parser.add_argument('--outReference', help='Output the reference chosen for scaffolding to this file')
parser.add_argument('--outStats', help='Output stats used in reference selection')
parser.add_argument(
"--allow_incomplete_output",
help="Do not fail with IncompleteAssemblyError if we fail to recover all segments of the desired genome.",
default=False,
action="store_true",
dest="allow_incomplete_output"
)
#parser.add_argument('--aligner',
# help='nucmer (nucleotide) or promer (six-frame translations) [default: %(default)s]',
# choices=['nucmer', 'promer'],
Expand Down Expand Up @@ -656,79 +663,80 @@ def impute_from_reference(

with open(inFasta, 'r') as asmFastaFile:
with open(inReference, 'r') as refFastaFile:
asmFasta = Bio.SeqIO.parse(asmFastaFile, 'fasta')
asmFasta = list(Bio.SeqIO.parse(asmFastaFile, 'fasta'))
refFasta = Bio.SeqIO.parse(refFastaFile, 'fasta')
for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)):
# our zip fails if one file has more seqs than the other
if not refSeqObj or not asmSeqObj:
raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.")

# error if PoorAssembly
minLength = len(refSeqObj) * minLengthFraction
non_n_count = util.misc.unambig_count(asmSeqObj.seq)
seq_len = len(asmSeqObj)
log.info(
"Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format(
idx=idx + 1,
segname=refSeqObj.id,
len_actual=seq_len,
len_desired=len(refSeqObj),
min_frac=minLengthFraction,
unamb_actual=non_n_count,
unamb_desired=seq_len,
min_unamb=minUnambig
)
)
if seq_len < minLength or non_n_count < seq_len * minUnambig:
raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj))

# prepare temp input and output files
tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta")
concat_file = util.file.mkstempfname('.ref_and_actual.fasta')
ref_file = util.file.mkstempfname('.ref.fasta')
actual_file = util.file.mkstempfname('.actual.fasta')
aligned_file = util.file.mkstempfname('.'+aligner+'.fasta')
refName = refSeqObj.id
with open(concat_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta")
with open(ref_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj], outf, "fasta")
with open(actual_file, 'wt') as outf:
Bio.SeqIO.write([asmSeqObj], outf, "fasta")

# align scaffolded genome to reference (choose one of three aligners)
if aligner == 'mafft':
assemble.mafft.MafftTool().execute(
[ref_file, actual_file], aligned_file, False, True, True, False, False, None
if asmFasta:
for idx, (refSeqObj, asmSeqObj) in enumerate(itertools.zip_longest(refFasta, asmFasta)):
# our zip fails if one file has more seqs than the other
if not refSeqObj or not asmSeqObj:
raise KeyError("inFasta and inReference do not have the same number of sequences. This could be because the de novo assembly process was unable to create contigs for all segments.")

# error if PoorAssembly
minLength = len(refSeqObj) * minLengthFraction
non_n_count = util.misc.unambig_count(asmSeqObj.seq)
seq_len = len(asmSeqObj)
log.info(
"Assembly Quality - segment {idx} - name {segname} - contig len {len_actual} / {len_desired} ({min_frac}) - unambiguous bases {unamb_actual} / {unamb_desired} ({min_unamb})".format(
idx=idx + 1,
segname=refSeqObj.id,
len_actual=seq_len,
len_desired=len(refSeqObj),
min_frac=minLengthFraction,
unamb_actual=non_n_count,
unamb_desired=seq_len,
min_unamb=minUnambig
)
)
elif aligner == 'muscle':
if len(refSeqObj) > 40000:
assemble.muscle.MuscleTool().execute(
concat_file, aligned_file, quiet=False,
maxiters=2, diags=True
if seq_len < minLength or non_n_count < seq_len * minUnambig:
raise PoorAssemblyError(idx + 1, seq_len, non_n_count, minLength, len(refSeqObj))

# prepare temp input and output files
tmpOutputFile = util.file.mkstempfname(prefix='seq-out-{idx}-'.format(idx=idx), suffix=".fasta")
concat_file = util.file.mkstempfname('.ref_and_actual.fasta')
ref_file = util.file.mkstempfname('.ref.fasta')
actual_file = util.file.mkstempfname('.actual.fasta')
aligned_file = util.file.mkstempfname('.'+aligner+'.fasta')
refName = refSeqObj.id
with open(concat_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj, asmSeqObj], outf, "fasta")
with open(ref_file, 'wt') as outf:
Bio.SeqIO.write([refSeqObj], outf, "fasta")
with open(actual_file, 'wt') as outf:
Bio.SeqIO.write([asmSeqObj], outf, "fasta")

# align scaffolded genome to reference (choose one of three aligners)
if aligner == 'mafft':
assemble.mafft.MafftTool().execute(
[ref_file, actual_file], aligned_file, False, True, True, False, False, None
)
else:
assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False)
elif aligner == 'mummer':
assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file)

# run modify_contig
args = [
aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends',
'--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps'
]
if newName:
# renames the segment name "sampleName-idx" where idx is the segment number
args.extend(['--name', newName + "-" + str(idx + 1)])
args = pmc.parse_args(args)
args.func_main(args)

# clean up
os.unlink(concat_file)
os.unlink(ref_file)
os.unlink(actual_file)
os.unlink(aligned_file)
tempFastas.append(tmpOutputFile)
elif aligner == 'muscle':
if len(refSeqObj) > 40000:
assemble.muscle.MuscleTool().execute(
concat_file, aligned_file, quiet=False,
maxiters=2, diags=True
)
else:
assemble.muscle.MuscleTool().execute(concat_file, aligned_file, quiet=False)
elif aligner == 'mummer':
assemble.mummer.MummerTool().align_one_to_one(ref_file, actual_file, aligned_file)

# run modify_contig
args = [
aligned_file, tmpOutputFile, refName, '--call-reference-ns', '--trim-ends', '--replace-5ends',
'--replace-3ends', '--replace-length', str(replaceLength), '--replace-end-gaps'
]
if newName:
# renames the segment name "sampleName-idx" where idx is the segment number
args.extend(['--name', newName + "-" + str(idx + 1)])
args = pmc.parse_args(args)
args.func_main(args)

# clean up
os.unlink(concat_file)
os.unlink(ref_file)
os.unlink(actual_file)
os.unlink(aligned_file)
tempFastas.append(tmpOutputFile)

# merge outputs
util.file.concat(tempFastas, outFasta)
Expand Down
9 changes: 9 additions & 0 deletions test/input/TestOrderAndOrient/contigs.lasv.one_small.fasta
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
>comp2_c0_seq1 len=421 path=[0:0-420]
GGGTGAGAGTGGTGGGGGTCGGTGGGAGACTCAGGGACTGTAGGGTGGGGGTCTGATGCT
GTCTGCTGCTCTAGTTGGAGGTGCTGTTGGAGTGGCTGATGGTCTCAGCTTTGTGGGGAG
AGGCATCTTGCAAATGGGGCACCTGTTGCTGACACTCAGAAGTAAGGTGAGGCAGTTGAG
ACACAGATAGTGGTTGTTGCACTCAACCAGGCCCTTGTTTTCGAACCAGCAGCTCTTGCA
GAACTGTGGCCCTAGATGTGTGGCATCCGGGATCAGGCTGGCTCTTGGACTGTCTCTTGA
TTCTGGGGCTTTGGATTGCTTGTTTCCCATTTGGAAGAGTTCAGCGTACAAAGCCTCAAA
AAGAAGAACAACCAAATTGCCTAGGATCCCCGGTGCGCGGGTCCATTTTCCTCATTTGTA
G
57 changes: 56 additions & 1 deletion test/unit/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,38 @@ def test_obscure_mummer3_bug(self):
os.path.join(inDir, 'contig.mummer3_fail_lasv.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta)


def test_not_all_segments_fail(self):
# IncompleteAssemblyError is thrown when only some but not all segments are recovered
inDir = util.file.get_test_input_path(self)
outFasta = util.file.mkstempfname('.fasta')
self.assertRaises(assembly.IncompleteAssemblyError,
assembly.order_and_orient,
os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta)

def test_not_all_segments_succeed(self):
# ... unless we specifically allow for partial outputs
inDir = util.file.get_test_input_path(self)
outFasta = util.file.mkstempfname('.fasta')
assembly.order_and_orient(
os.path.join(inDir, 'contigs.lasv.one_small.fasta'),
os.path.join(inDir, 'ref.lasv.ISTH2376.fasta'),
outFasta,
allow_incomplete_output=True)

def test_empty_output_succeed(self):
# a completely empty output should be possible if we allow it
inDir = util.file.get_test_input_path(self)
emptyFasta = os.path.join(inDir, '..', 'empty.fasta')
outFasta = util.file.mkstempfname('.fasta')
assembly.order_and_orient(
emptyFasta,
os.path.join(inDir, 'ref.influenza.fasta'),
outFasta,
allow_incomplete_output=True)
self.assertEqualContents(outFasta, emptyFasta)

class TestGap2Seq(TestCaseWithTmp):
'''Test gap-filling tool Gap2Seq'''
Expand All @@ -436,6 +467,15 @@ def test_gapfill(self):
shutil.copyfile(filled, '/tmp/filled.fasta')
self.assertEqualContents(filled, join(inDir, 'TestGap2Seq', 'expected.ebov.doublehit.gapfill.fasta'))

def test_empty_fasta_input(self):
inDir = util.file.get_test_input_path()
empty_fasta = os.path.join(inDir, 'empty.fasta')
out_fasta = util.file.mkstempfname('.fasta')
assembly.gapfill_gap2seq(in_scaffold=empty_fasta,
in_bam=os.path.join(inDir, 'G5012.3.testreads.bam'),
out_scaffold=out_fasta, random_seed=23923937, threads=1)
self.assertEqualContents(out_fasta, empty_fasta)


class TestImputeFromReference(TestCaseWithTmp):
''' Test the impute_from_reference command (align and modify_contig) '''
Expand Down Expand Up @@ -527,6 +567,21 @@ def test_small_mummer(self):
str(Bio.SeqIO.read(outFasta, 'fasta').seq),
str(Bio.SeqIO.read(expected, 'fasta').seq))

def test_empty_fasta_input(self):
inDir = util.file.get_test_input_path(self)
empty_fasta = os.path.join(inDir, '..', 'empty.fasta')
outFasta = util.file.mkstempfname('.fasta')
assembly.impute_from_reference(
empty_fasta,
os.path.join(inDir, 'ref.sub.ebov.fasta'),
outFasta,
minLengthFraction=0.0,
minUnambig=0.0,
replaceLength=5,
newName='test_sub-EBOV.genome',
aligner='mummer')
self.assertEqualContents(outFasta, empty_fasta)


class TestMutableSequence(unittest.TestCase):
''' Test the MutableSequence class '''
Expand Down

0 comments on commit 5a15d84

Please sign in to comment.