Skip to content

Commit

Permalink
add --noextend option to CLI (related to #340)
Browse files Browse the repository at this point in the history
This also runs a --noextend test set in the Makefile, and adds a test
for command creation (but not output regression).

The --noextend option appears to affect low coverage and identity
scores quite significantly.

NOTE: this is not in itself a fix for #340. That requires changes to
how we write and parse nucmer output.
  • Loading branch information
widdowquinn committed Nov 10, 2021
1 parent 80d60cb commit 7272ff3
Show file tree
Hide file tree
Showing 7 changed files with 96 additions and 33 deletions.
12 changes: 11 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,12 +1,22 @@
# CHANGES.md

## v0.2.12-a

**NOTE:** Issue #340 raised major questions about the accuracy of the MUMmer calculations in ANIm analysis. In particular, it was noted that coverage could be overreported, but there are also effects on reported percentage identity - especially at lower identity values.

- add `--noextend` option for ANIm (and tests), in response to issue #340; NOTE: this does not, by itself fix issue #340

## v0.2.11

- fix `pandas` error handling

## v0.2.10

- fix Issue #178: input filenames that could be interpreted as floats may break `labels.txt`/`classes.txt` integration, and graphical output

## v0.2.9

- fix 1ssue #132: TETRA would fail if not all 4-mers present in all input sequences
- fix issue #132: TETRA would fail if not all 4-mers present in all input sequences

## v0.2.8

Expand Down
31 changes: 22 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,12 @@ DATA=tests/test_ani_data
OUT_B=tests/test_ANIb_output
OUT_BLASTALL=tests/test_ANIblastall_output
OUT_M=tests/test_ANIm_output
OUT_NOEXTEND=tests/test_ANIm_noextend_output
OUT_TETRA=tests/test_TETRA_output
CLASSES=$(DATA)/classes.tab
LABELS=$(DATA)/labels.tab
GFORMAT=pdf,png,eps
GMETHOD=mpl
GMETHOD=seaborn

# Decide whether we're skipping generation of intermediate files
ifeq ($(LAZY), 1)
Expand All @@ -41,15 +42,25 @@ sge : clean test_sge

ANIm :
@echo "Testing ANIm analysis..."
./average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
-m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \
$(LAZYMUMMER) --gformat $(GFORMAT) --gmethod $(GMETHOD)
@echo "ANIm analysis test done"
@echo

ANIm_noextend :
@echo "Testing ANIm analysis (--noextend)..."
average_nucleotide_identity.py -i $(DATA) -o $(OUT_NOEXTEND) \
-m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \
$(LAZYMUMMER) --gformat $(GFORMAT) --gmethod $(GMETHOD) \
--noextend
@echo "ANIm (--noextend) analysis test done"
@echo


ANIm_sge :
@echo "Testing ANIm analysis with SGE..."
./average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
-m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \
--scheduler SGE $(LAZYMUMMER) --gformat $(GFORMAT)\
--gmethod $(GMETHOD)
Expand All @@ -58,15 +69,15 @@ ANIm_sge :

ANIb :
@echo "Testing ANIb analysis..."
./average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
-m ANIb --classes $(CLASSES) --labels $(LABELS) -g -v \
$(LAZYBLAST) --gformat $(GFORMAT) --gmethod $(GMETHOD)
@echo "ANIb analysis test done"
@echo

ANIb_sge :
@echo "Testing ANIb analysis with SGE..."
./average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
-m ANIb --classes $(CLASSES) --labels $(LABELS) -g -v \
--scheduler SGE $(LAZYBLAST) --gformat $(GFORMAT) \
--gmethod $(GMETHOD)
Expand All @@ -75,15 +86,15 @@ ANIb_sge :

ANIblastall :
@echo "Testing ANIblastall analysis..."
./average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
-m ANIblastall --classes $(CLASSES) --labels $(LABELS) -g -v \
$(LAZYBLAST) --gformat $(GFORMAT) --gmethod $(GMETHOD)
@echo "ANIblastall analysis test done"
@echo

ANIblastall_sge :
@echo "Testing ANIblastall analysis with SGE..."
./average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
-m ANIblastall --classes $(CLASSES) --labels $(LABELS) -g -v \
--scheduler SGE $(LAZYBLAST) --gformat $(GFORMAT)
--gmethod $(GMETHOD)
Expand All @@ -92,16 +103,18 @@ ANIblastall_sge :

BLAST : ANIb ANIblastall

MUMmer: ANIm ANIm_noextend

TETRA :
@echo "Testing TETRA analysis..."
./average_nucleotide_identity.py -i $(DATA) -o $(OUT_TETRA) \
average_nucleotide_identity.py -i $(DATA) -o $(OUT_TETRA) \
-m TETRA --classes $(CLASSES) --labels $(LABELS) -g -v \
$(LAZYTETRA) --gformat $(GFORMAT) --gmethod $(GMETHOD)
@echo "TETRA analysis test done"
@echo

clean :
rm -rf $(OUT_M) $(OUT_B) $(OUT_BLASTALL) $(OUT_TETRA)
rm -rf $(OUT_M) $(OUT_NOEXTEND) $(OUT_B) $(OUT_BLASTALL) $(OUT_TETRA)

test : ANIm ANIb ANIblastall TETRA

Expand Down
14 changes: 12 additions & 2 deletions bin/average_nucleotide_identity.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# (c) The James Hutton Institute 2013-2019
# (c) University of Strathclyde 2019-2020
# (c) University of Strathclyde 2019-2021
# Author: Leighton Pritchard
#
# Contact: [email protected]
Expand All @@ -17,7 +17,7 @@
# The MIT License
#
# Copyright (c) 2013-2019 The James Hutton Institute
# Copyright (c) 2019-2020 University of Strathclyde
# Copyright (c) 2019-2021 University of Strathclyde
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -143,6 +143,8 @@
ANI method
--maxmatch Override MUMmer settings and allow all matches in
NUCmer
--noextend Override MUMmer settings and do not extend seed
alignments
--nucmer_exe=NUCMER_EXE
Path to NUCmer executable
--blast_exe=BLAST_EXE
Expand Down Expand Up @@ -353,6 +355,13 @@ def parse_cmdline():
default=False,
help="Override MUMmer to allow all NUCmer matches",
)
parser.add_argument(
"--noextend",
dest="noextend",
action="store_true",
default=False,
help="Override MUMmer to extend seed alignments",
)
parser.add_argument(
"--nucmer_exe",
dest="nucmer_exe",
Expand Down Expand Up @@ -535,6 +544,7 @@ def calculate_anim(infiles, org_lengths):
nucmer_exe=args.nucmer_exe,
filter_exe=args.filter_exe,
maxmatch=args.maxmatch,
noextend=args.noextend,
jobprefix=args.jobprefix,
)
if args.scheduler == "multiprocessing":
Expand Down
2 changes: 1 addition & 1 deletion pyani/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# python package version
# should match r"^__version__ = '(?P<version>[^']+)'$" for setup.py
__version__ = "0.2.11"
__version__ = "0.2.12-a"
20 changes: 18 additions & 2 deletions pyani/anim.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ def generate_nucmer_jobs(
nucmer_exe=pyani_config.NUCMER_DEFAULT,
filter_exe=pyani_config.FILTER_DEFAULT,
maxmatch=False,
noextend=False,
jobprefix="ANINUCmer",
):
"""Return a list of Jobs describing NUCmer command-lines for ANIm
Expand All @@ -44,12 +45,14 @@ def generate_nucmer_jobs(
- outdir - path to output directory
- nucmer_exe - location of the nucmer binary
- maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option
- noextend - Boolean flag indicating whether to override extension of
NUCmer seed alignment (use --noextend flag)
Loop over all FASTA files, generating Jobs describing NUCmer command lines
for each pairwise comparison.
"""
ncmds, fcmds = generate_nucmer_commands(
filenames, outdir, nucmer_exe, filter_exe, maxmatch
filenames, outdir, nucmer_exe, filter_exe, maxmatch, noextend
)
joblist = []
for idx, ncmd in enumerate(ncmds):
Expand All @@ -69,6 +72,7 @@ def generate_nucmer_commands(
nucmer_exe=pyani_config.NUCMER_DEFAULT,
filter_exe=pyani_config.FILTER_DEFAULT,
maxmatch=False,
noextend=False,
):
"""Return a tuple of lists of NUCmer command-lines for ANIm
Expand All @@ -81,6 +85,8 @@ def generate_nucmer_commands(
- outdir - path to output directory
- nucmer_exe - location of the nucmer binary
- maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option
- noextend - Boolean flag indicating whether to override extension of
NUCmer seed alignment (use --noextend flag)
Loop over all FASTA files generating NUCmer command lines for each
pairwise comparison.
Expand All @@ -89,7 +95,7 @@ def generate_nucmer_commands(
for idx, fname1 in enumerate(filenames[:-1]):
for fname2 in filenames[idx + 1 :]:
ncmd, dcmd = construct_nucmer_cmdline(
fname1, fname2, outdir, nucmer_exe, filter_exe, maxmatch
fname1, fname2, outdir, nucmer_exe, filter_exe, maxmatch, noextend
)
nucmer_cmdlines.append(ncmd)
delta_filter_cmdlines.append(dcmd)
Expand All @@ -105,6 +111,7 @@ def construct_nucmer_cmdline(
nucmer_exe=pyani_config.NUCMER_DEFAULT,
filter_exe=pyani_config.FILTER_DEFAULT,
maxmatch=False,
noextend=False,
):
"""Returns a tuple of NUCmer and delta-filter commands
Expand All @@ -120,6 +127,8 @@ def construct_nucmer_cmdline(
- outdir - path to output directory
- maxmatch - Boolean flag indicating whether to use NUCmer's -maxmatch
option. If not, the -mum option is used instead
- noextend - Boolean flag indicating whether to override extension of
NUCmer seed alignment (use --noextend flag)
"""
outsubdir = os.path.join(outdir, pyani_config.ALIGNDIR["ANIm"])
outprefix = os.path.join(
Expand All @@ -130,10 +139,17 @@ def construct_nucmer_cmdline(
os.path.splitext(os.path.split(fname2)[-1])[0],
),
)

# Do we use maxmatch?
if maxmatch:
mode = "--maxmatch"
else:
mode = "--mum"

# Do we extend seed alignments?
if noextend:
mode += " --noextend"

nucmercmd = "{0} {1} -p {2} {3} {4}".format(
nucmer_exe, mode, outprefix, fname1, fname2
)
Expand Down
4 changes: 2 additions & 2 deletions pyani/pyani_tools.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# (c) The James Hutton Institute 2016-2019
# (c) University of Strathclyde 2019-2020
# (c) University of Strathclyde 2019-2021
# Author: Leighton Pritchard
#
# Contact: [email protected]
Expand All @@ -16,7 +16,7 @@
# The MIT License
#
# Copyright (c) 2016-2019 The James Hutton Institute
# Copyright (c) 2019-2020 University of Strathclyde
# Copyright (c) 2019-2021 University of Strathclyde
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
Expand Down
46 changes: 30 additions & 16 deletions tests/test_anim.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,25 +12,24 @@
print() statements will be caught by nosetests unless there is an
error. They can also be recovered with the -s option.
(c) The James Hutton Institute 2017
Author: Leighton Pritchard
Contact:
[email protected]
Leighton Pritchard,
Information and Computing Sciences,
James Hutton Institute,
Errol Road,
Invergowrie,
Dundee,
DD6 9LH,
Scotland,
UK
(c) The James Hutton Institute 2017-2019
(c) University of Strathclyde 2019-2021
# Author: Leighton Pritchard
#
# Contact: [email protected]
#
# Leighton Pritchard,
# Strathclyde Institute for Pharmacy and Biomedical Sciences,
# Cathedral Street,
# Glasgow,
# G4 0RE
# Scotland,
# UK
The MIT License
Copyright (c) 2017 The James Hutton Institute
Copyright (c) 2017-2019 The James Hutton Institute
Copyright (c) University of Strathclyde 2019-2021
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand Down Expand Up @@ -83,6 +82,14 @@ def setUp(self):
"file1.fna file2.fna",
]
)
self.ntgtnoextend = " ".join(
[
"nucmer --maxmatch --noextend -p",
"tests/test_output/anim/nucmer_output/file1_vs_file2",
"file1.fna file2.fna",
]
)

self.ftgt = " ".join(
[
"delta_filter_wrapper.py delta-filter -1",
Expand Down Expand Up @@ -164,6 +171,13 @@ def test_maxmatch_cmd_generation(self):
)
assert_equal(ncmd, self.ntgtmax)

def test_noextend_cmd_generation(self):
"""generate NUCmer command line with noextend."""
ncmd, fcmd = anim.construct_nucmer_cmdline(
"file1.fna", "file2.fna", outdir=self.outdir, maxmatch=True, noextend=True
)
assert_equal(ncmd, self.ntgtnoextend)

def test_multi_cmd_generation(self):
"""generate multiple abstract NUCmer/delta-filter command-lines.
Expand Down

0 comments on commit 7272ff3

Please sign in to comment.