add --noextend option to CLI (related to #340)

This also runs a --noextend test set in the Makefile, and adds a test for command creation (but not output regression). The --noextend option appears to affect low coverage and identity scores quite significantly. NOTE: this is not in itself a fix for #340. That requires changes to how we write and parse nucmer output.
widdowquinn · Nov 10, 2021 · 7272ff3 · 7272ff3
1 parent 80d60cb
commit 7272ff3
Show file tree

Hide file tree

Showing 7 changed files with 96 additions and 33 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,12 +1,22 @@
 # CHANGES.md
 
+## v0.2.12-a
+
+**NOTE:** Issue #340 raised major questions about the accuracy of the MUMmer calculations in ANIm analysis. In particular, it was noted that coverage could be overreported, but there are also effects on reported percentage identity - especially at lower identity values.
+
+- add `--noextend` option for ANIm (and tests), in response to issue #340; NOTE: this does not, by itself fix issue #340
+
+## v0.2.11
+
+- fix `pandas` error handling
+
 ## v0.2.10
 
 - fix Issue #178: input filenames that could be interpreted as floats may break `labels.txt`/`classes.txt` integration, and graphical output
 
 ## v0.2.9
 
-- fix 1ssue #132: TETRA would fail if not all 4-mers present in all input sequences
+- fix issue #132: TETRA would fail if not all 4-mers present in all input sequences
 
 ## v0.2.8
 

diff --git a/Makefile b/Makefile
@@ -18,11 +18,12 @@ DATA=tests/test_ani_data
 OUT_B=tests/test_ANIb_output
 OUT_BLASTALL=tests/test_ANIblastall_output
 OUT_M=tests/test_ANIm_output
+OUT_NOEXTEND=tests/test_ANIm_noextend_output
 OUT_TETRA=tests/test_TETRA_output
 CLASSES=$(DATA)/classes.tab
 LABELS=$(DATA)/labels.tab
 GFORMAT=pdf,png,eps
-GMETHOD=mpl
+GMETHOD=seaborn
 
 # Decide whether we're skipping generation of intermediate files
 ifeq ($(LAZY), 1)
@@ -41,15 +42,25 @@ sge : clean test_sge
 
 ANIm :
 	@echo "Testing ANIm analysis..."
-	./average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
 	  -m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \
 	  $(LAZYMUMMER) --gformat $(GFORMAT) --gmethod $(GMETHOD)
 	@echo "ANIm analysis test done"
 	@echo
 
+ANIm_noextend :
+	@echo "Testing ANIm analysis (--noextend)..."
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_NOEXTEND) \
+	  -m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \
+	  $(LAZYMUMMER) --gformat $(GFORMAT) --gmethod $(GMETHOD) \
+	  --noextend
+	@echo "ANIm (--noextend) analysis test done"
+	@echo
+
+
 ANIm_sge :
 	@echo "Testing ANIm analysis with SGE..."
-	./average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \
 	  -m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \
 	  --scheduler SGE $(LAZYMUMMER) --gformat $(GFORMAT)\
 	  --gmethod $(GMETHOD)
@@ -58,15 +69,15 @@ ANIm_sge :
 
 ANIb :
 	@echo "Testing ANIb analysis..."
-	./average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
 	  -m ANIb --classes $(CLASSES) --labels $(LABELS) -g -v \
 	  $(LAZYBLAST) --gformat $(GFORMAT) --gmethod $(GMETHOD)
 	@echo "ANIb analysis test done"
 	@echo
 
 ANIb_sge :
 	@echo "Testing ANIb analysis with SGE..."
-	./average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \
 	  -m ANIb --classes $(CLASSES) --labels $(LABELS) -g -v \
 	  --scheduler SGE $(LAZYBLAST) --gformat $(GFORMAT) \
 	  --gmethod $(GMETHOD)
@@ -75,15 +86,15 @@ ANIb_sge :
 
 ANIblastall :
 	@echo "Testing ANIblastall analysis..."
-	./average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
 	  -m ANIblastall --classes $(CLASSES) --labels $(LABELS) -g -v \
 	  $(LAZYBLAST) --gformat $(GFORMAT) --gmethod $(GMETHOD)
 	@echo "ANIblastall analysis test done"
 	@echo
 
 ANIblastall_sge :
 	@echo "Testing ANIblastall analysis with SGE..."
-	./average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \
 	  -m ANIblastall --classes $(CLASSES) --labels $(LABELS) -g -v \
 	  --scheduler SGE $(LAZYBLAST) --gformat $(GFORMAT)
 	  --gmethod $(GMETHOD)
@@ -92,16 +103,18 @@ ANIblastall_sge :
 
 BLAST : ANIb ANIblastall
 
+MUMmer: ANIm ANIm_noextend
+
 TETRA :
 	@echo "Testing TETRA analysis..."
-	./average_nucleotide_identity.py -i $(DATA) -o $(OUT_TETRA) \
+	average_nucleotide_identity.py -i $(DATA) -o $(OUT_TETRA) \
 	  -m TETRA --classes $(CLASSES) --labels $(LABELS) -g -v \
 	  $(LAZYTETRA) --gformat $(GFORMAT) --gmethod $(GMETHOD)
 	@echo "TETRA analysis test done"
 	@echo
 
 clean :
-	rm -rf $(OUT_M) $(OUT_B) $(OUT_BLASTALL) $(OUT_TETRA)
+	rm -rf $(OUT_M) $(OUT_NOEXTEND) $(OUT_B) $(OUT_BLASTALL) $(OUT_TETRA)
 
 test : ANIm ANIb ANIblastall TETRA
 

diff --git a/bin/average_nucleotide_identity.py b/bin/average_nucleotide_identity.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 # (c) The James Hutton Institute 2013-2019
-# (c) University of Strathclyde 2019-2020
+# (c) University of Strathclyde 2019-2021
 # Author: Leighton Pritchard
 #
 # Contact: [email protected]
@@ -17,7 +17,7 @@
 # The MIT License
 #
 # Copyright (c) 2013-2019 The James Hutton Institute
-# Copyright (c) 2019-2020 University of Strathclyde
+# Copyright (c) 2019-2021 University of Strathclyde
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -143,6 +143,8 @@
                         ANI method
   --maxmatch            Override MUMmer settings and allow all matches in
                         NUCmer
+  --noextend            Override MUMmer settings and do not extend seed
+                        alignments
   --nucmer_exe=NUCMER_EXE
                         Path to NUCmer executable
   --blast_exe=BLAST_EXE
@@ -353,6 +355,13 @@ def parse_cmdline():
         default=False,
         help="Override MUMmer to allow all NUCmer matches",
     )
+    parser.add_argument(
+        "--noextend",
+        dest="noextend",
+        action="store_true",
+        default=False,
+        help="Override MUMmer to extend seed alignments",
+    )
     parser.add_argument(
         "--nucmer_exe",
         dest="nucmer_exe",
@@ -535,6 +544,7 @@ def calculate_anim(infiles, org_lengths):
             nucmer_exe=args.nucmer_exe,
             filter_exe=args.filter_exe,
             maxmatch=args.maxmatch,
+            noextend=args.noextend,
             jobprefix=args.jobprefix,
         )
         if args.scheduler == "multiprocessing":

diff --git a/pyani/__init__.py b/pyani/__init__.py
@@ -1,3 +1,3 @@
 # python package version
 # should match r"^__version__ = '(?P<version>[^']+)'$" for setup.py
-__version__ = "0.2.11"
+__version__ = "0.2.12-a"
diff --git a/pyani/anim.py b/pyani/anim.py
@@ -36,6 +36,7 @@ def generate_nucmer_jobs(
     nucmer_exe=pyani_config.NUCMER_DEFAULT,
     filter_exe=pyani_config.FILTER_DEFAULT,
     maxmatch=False,
+    noextend=False,
     jobprefix="ANINUCmer",
 ):
     """Return a list of Jobs describing NUCmer command-lines for ANIm
@@ -44,12 +45,14 @@ def generate_nucmer_jobs(
     - outdir - path to output directory
     - nucmer_exe - location of the nucmer binary
     - maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option
+    - noextend - Boolean flag indicating whether to override extension of
+    NUCmer seed alignment (use --noextend flag)
 
     Loop over all FASTA files, generating Jobs describing NUCmer command lines
     for each pairwise comparison.
     """
     ncmds, fcmds = generate_nucmer_commands(
-        filenames, outdir, nucmer_exe, filter_exe, maxmatch
+        filenames, outdir, nucmer_exe, filter_exe, maxmatch, noextend
     )
     joblist = []
     for idx, ncmd in enumerate(ncmds):
@@ -69,6 +72,7 @@ def generate_nucmer_commands(
     nucmer_exe=pyani_config.NUCMER_DEFAULT,
     filter_exe=pyani_config.FILTER_DEFAULT,
     maxmatch=False,
+    noextend=False,
 ):
     """Return a tuple of lists of NUCmer command-lines for ANIm
 
@@ -81,6 +85,8 @@ def generate_nucmer_commands(
     - outdir - path to output directory
     - nucmer_exe - location of the nucmer binary
     - maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option
+    - noextend - Boolean flag indicating whether to override extension of
+    NUCmer seed alignment (use --noextend flag)
 
     Loop over all FASTA files generating NUCmer command lines for each
     pairwise comparison.
@@ -89,7 +95,7 @@ def generate_nucmer_commands(
     for idx, fname1 in enumerate(filenames[:-1]):
         for fname2 in filenames[idx + 1 :]:
             ncmd, dcmd = construct_nucmer_cmdline(
-                fname1, fname2, outdir, nucmer_exe, filter_exe, maxmatch
+                fname1, fname2, outdir, nucmer_exe, filter_exe, maxmatch, noextend
             )
             nucmer_cmdlines.append(ncmd)
             delta_filter_cmdlines.append(dcmd)
@@ -105,6 +111,7 @@ def construct_nucmer_cmdline(
     nucmer_exe=pyani_config.NUCMER_DEFAULT,
     filter_exe=pyani_config.FILTER_DEFAULT,
     maxmatch=False,
+    noextend=False,
 ):
     """Returns a tuple of NUCmer and delta-filter commands
 
@@ -120,6 +127,8 @@ def construct_nucmer_cmdline(
     - outdir - path to output directory
     - maxmatch - Boolean flag indicating whether to use NUCmer's -maxmatch
     option. If not, the -mum option is used instead
+    - noextend - Boolean flag indicating whether to override extension of
+    NUCmer seed alignment (use --noextend flag)
     """
     outsubdir = os.path.join(outdir, pyani_config.ALIGNDIR["ANIm"])
     outprefix = os.path.join(
@@ -130,10 +139,17 @@ def construct_nucmer_cmdline(
             os.path.splitext(os.path.split(fname2)[-1])[0],
         ),
     )
+
+    # Do we use maxmatch?
     if maxmatch:
         mode = "--maxmatch"
     else:
         mode = "--mum"
+
+    # Do we extend seed alignments?
+    if noextend:
+        mode += " --noextend"
+
     nucmercmd = "{0} {1} -p {2} {3} {4}".format(
         nucmer_exe, mode, outprefix, fname1, fname2
     )

diff --git a/pyani/pyani_tools.py b/pyani/pyani_tools.py
@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
 # (c) The James Hutton Institute 2016-2019
-# (c) University of Strathclyde 2019-2020
+# (c) University of Strathclyde 2019-2021
 # Author: Leighton Pritchard
 #
 # Contact: [email protected]
@@ -16,7 +16,7 @@
 # The MIT License
 #
 # Copyright (c) 2016-2019 The James Hutton Institute
-# Copyright (c) 2019-2020 University of Strathclyde
+# Copyright (c) 2019-2021 University of Strathclyde
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal

diff --git a/tests/test_anim.py b/tests/test_anim.py
@@ -12,25 +12,24 @@
 print() statements will be caught by nosetests unless there is an
 error. They can also be recovered with the -s option.
 
-(c) The James Hutton Institute 2017
-Author: Leighton Pritchard
-
-Contact:
-[email protected]
-
-Leighton Pritchard,
-Information and Computing Sciences,
-James Hutton Institute,
-Errol Road,
-Invergowrie,
-Dundee,
-DD6 9LH,
-Scotland,
-UK
+(c) The James Hutton Institute 2017-2019
+(c) University of Strathclyde 2019-2021
+# Author: Leighton Pritchard
+#
+# Contact: [email protected]
+#
+# Leighton Pritchard,
+# Strathclyde Institute for Pharmacy and Biomedical Sciences,
+# Cathedral Street,
+# Glasgow,
+# G4 0RE
+# Scotland,
+# UK
 
 The MIT License
 
-Copyright (c) 2017 The James Hutton Institute
+Copyright (c) 2017-2019 The James Hutton Institute
+Copyright (c) University of Strathclyde 2019-2021
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -83,6 +82,14 @@ def setUp(self):
                 "file1.fna file2.fna",
             ]
         )
+        self.ntgtnoextend = " ".join(
+            [
+                "nucmer --maxmatch --noextend -p",
+                "tests/test_output/anim/nucmer_output/file1_vs_file2",
+                "file1.fna file2.fna",
+            ]
+        )
+
         self.ftgt = " ".join(
             [
                 "delta_filter_wrapper.py delta-filter -1",
@@ -164,6 +171,13 @@ def test_maxmatch_cmd_generation(self):
         )
         assert_equal(ncmd, self.ntgtmax)
 
+    def test_noextend_cmd_generation(self):
+        """generate NUCmer command line with noextend."""
+        ncmd, fcmd = anim.construct_nucmer_cmdline(
+            "file1.fna", "file2.fna", outdir=self.outdir, maxmatch=True, noextend=True
+        )
+        assert_equal(ncmd, self.ntgtnoextend)
+
     def test_multi_cmd_generation(self):
         """generate multiple abstract NUCmer/delta-filter command-lines.