From 7272ff37e428bb34a3419639dad2c6d14a1ccf3f Mon Sep 17 00:00:00 2001 From: widdowquinn Date: Wed, 10 Nov 2021 15:47:56 +0000 Subject: [PATCH] add --noextend option to CLI (related to #340) This also runs a --noextend test set in the Makefile, and adds a test for command creation (but not output regression). The --noextend option appears to affect low coverage and identity scores quite significantly. NOTE: this is not in itself a fix for #340. That requires changes to how we write and parse nucmer output. --- CHANGES.md | 12 +++++++- Makefile | 31 ++++++++++++++------ bin/average_nucleotide_identity.py | 14 +++++++-- pyani/__init__.py | 2 +- pyani/anim.py | 20 +++++++++++-- pyani/pyani_tools.py | 4 +-- tests/test_anim.py | 46 +++++++++++++++++++----------- 7 files changed, 96 insertions(+), 33 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 3023d174..eb90e8d9 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,12 +1,22 @@ # CHANGES.md +## v0.2.12-a + +**NOTE:** Issue #340 raised major questions about the accuracy of the MUMmer calculations in ANIm analysis. In particular, it was noted that coverage could be overreported, but there are also effects on reported percentage identity - especially at lower identity values. + +- add `--noextend` option for ANIm (and tests), in response to issue #340; NOTE: this does not, by itself fix issue #340 + +## v0.2.11 + +- fix `pandas` error handling + ## v0.2.10 - fix Issue #178: input filenames that could be interpreted as floats may break `labels.txt`/`classes.txt` integration, and graphical output ## v0.2.9 -- fix 1ssue #132: TETRA would fail if not all 4-mers present in all input sequences +- fix issue #132: TETRA would fail if not all 4-mers present in all input sequences ## v0.2.8 diff --git a/Makefile b/Makefile index 047d23f0..99ba1cc9 100644 --- a/Makefile +++ b/Makefile @@ -18,11 +18,12 @@ DATA=tests/test_ani_data OUT_B=tests/test_ANIb_output OUT_BLASTALL=tests/test_ANIblastall_output OUT_M=tests/test_ANIm_output +OUT_NOEXTEND=tests/test_ANIm_noextend_output OUT_TETRA=tests/test_TETRA_output CLASSES=$(DATA)/classes.tab LABELS=$(DATA)/labels.tab GFORMAT=pdf,png,eps -GMETHOD=mpl +GMETHOD=seaborn # Decide whether we're skipping generation of intermediate files ifeq ($(LAZY), 1) @@ -41,15 +42,25 @@ sge : clean test_sge ANIm : @echo "Testing ANIm analysis..." - ./average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \ + average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \ -m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \ $(LAZYMUMMER) --gformat $(GFORMAT) --gmethod $(GMETHOD) @echo "ANIm analysis test done" @echo +ANIm_noextend : + @echo "Testing ANIm analysis (--noextend)..." + average_nucleotide_identity.py -i $(DATA) -o $(OUT_NOEXTEND) \ + -m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \ + $(LAZYMUMMER) --gformat $(GFORMAT) --gmethod $(GMETHOD) \ + --noextend + @echo "ANIm (--noextend) analysis test done" + @echo + + ANIm_sge : @echo "Testing ANIm analysis with SGE..." - ./average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \ + average_nucleotide_identity.py -i $(DATA) -o $(OUT_M) \ -m ANIm --classes $(CLASSES) --labels $(LABELS) -g -v \ --scheduler SGE $(LAZYMUMMER) --gformat $(GFORMAT)\ --gmethod $(GMETHOD) @@ -58,7 +69,7 @@ ANIm_sge : ANIb : @echo "Testing ANIb analysis..." - ./average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \ + average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \ -m ANIb --classes $(CLASSES) --labels $(LABELS) -g -v \ $(LAZYBLAST) --gformat $(GFORMAT) --gmethod $(GMETHOD) @echo "ANIb analysis test done" @@ -66,7 +77,7 @@ ANIb : ANIb_sge : @echo "Testing ANIb analysis with SGE..." - ./average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \ + average_nucleotide_identity.py -i $(DATA) -o $(OUT_B) \ -m ANIb --classes $(CLASSES) --labels $(LABELS) -g -v \ --scheduler SGE $(LAZYBLAST) --gformat $(GFORMAT) \ --gmethod $(GMETHOD) @@ -75,7 +86,7 @@ ANIb_sge : ANIblastall : @echo "Testing ANIblastall analysis..." - ./average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \ + average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \ -m ANIblastall --classes $(CLASSES) --labels $(LABELS) -g -v \ $(LAZYBLAST) --gformat $(GFORMAT) --gmethod $(GMETHOD) @echo "ANIblastall analysis test done" @@ -83,7 +94,7 @@ ANIblastall : ANIblastall_sge : @echo "Testing ANIblastall analysis with SGE..." - ./average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \ + average_nucleotide_identity.py -i $(DATA) -o $(OUT_BLASTALL) \ -m ANIblastall --classes $(CLASSES) --labels $(LABELS) -g -v \ --scheduler SGE $(LAZYBLAST) --gformat $(GFORMAT) --gmethod $(GMETHOD) @@ -92,16 +103,18 @@ ANIblastall_sge : BLAST : ANIb ANIblastall +MUMmer: ANIm ANIm_noextend + TETRA : @echo "Testing TETRA analysis..." - ./average_nucleotide_identity.py -i $(DATA) -o $(OUT_TETRA) \ + average_nucleotide_identity.py -i $(DATA) -o $(OUT_TETRA) \ -m TETRA --classes $(CLASSES) --labels $(LABELS) -g -v \ $(LAZYTETRA) --gformat $(GFORMAT) --gmethod $(GMETHOD) @echo "TETRA analysis test done" @echo clean : - rm -rf $(OUT_M) $(OUT_B) $(OUT_BLASTALL) $(OUT_TETRA) + rm -rf $(OUT_M) $(OUT_NOEXTEND) $(OUT_B) $(OUT_BLASTALL) $(OUT_TETRA) test : ANIm ANIb ANIblastall TETRA diff --git a/bin/average_nucleotide_identity.py b/bin/average_nucleotide_identity.py index a24a2dac..24d6659a 100755 --- a/bin/average_nucleotide_identity.py +++ b/bin/average_nucleotide_identity.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- # (c) The James Hutton Institute 2013-2019 -# (c) University of Strathclyde 2019-2020 +# (c) University of Strathclyde 2019-2021 # Author: Leighton Pritchard # # Contact: leighton.pritchard@strath.ac.uk @@ -17,7 +17,7 @@ # The MIT License # # Copyright (c) 2013-2019 The James Hutton Institute -# Copyright (c) 2019-2020 University of Strathclyde +# Copyright (c) 2019-2021 University of Strathclyde # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -143,6 +143,8 @@ ANI method --maxmatch Override MUMmer settings and allow all matches in NUCmer + --noextend Override MUMmer settings and do not extend seed + alignments --nucmer_exe=NUCMER_EXE Path to NUCmer executable --blast_exe=BLAST_EXE @@ -353,6 +355,13 @@ def parse_cmdline(): default=False, help="Override MUMmer to allow all NUCmer matches", ) + parser.add_argument( + "--noextend", + dest="noextend", + action="store_true", + default=False, + help="Override MUMmer to extend seed alignments", + ) parser.add_argument( "--nucmer_exe", dest="nucmer_exe", @@ -535,6 +544,7 @@ def calculate_anim(infiles, org_lengths): nucmer_exe=args.nucmer_exe, filter_exe=args.filter_exe, maxmatch=args.maxmatch, + noextend=args.noextend, jobprefix=args.jobprefix, ) if args.scheduler == "multiprocessing": diff --git a/pyani/__init__.py b/pyani/__init__.py index ea0b0ec2..94f0ada8 100644 --- a/pyani/__init__.py +++ b/pyani/__init__.py @@ -1,3 +1,3 @@ # python package version # should match r"^__version__ = '(?P[^']+)'$" for setup.py -__version__ = "0.2.11" +__version__ = "0.2.12-a" diff --git a/pyani/anim.py b/pyani/anim.py index f1146a82..8acf2871 100644 --- a/pyani/anim.py +++ b/pyani/anim.py @@ -36,6 +36,7 @@ def generate_nucmer_jobs( nucmer_exe=pyani_config.NUCMER_DEFAULT, filter_exe=pyani_config.FILTER_DEFAULT, maxmatch=False, + noextend=False, jobprefix="ANINUCmer", ): """Return a list of Jobs describing NUCmer command-lines for ANIm @@ -44,12 +45,14 @@ def generate_nucmer_jobs( - outdir - path to output directory - nucmer_exe - location of the nucmer binary - maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option + - noextend - Boolean flag indicating whether to override extension of + NUCmer seed alignment (use --noextend flag) Loop over all FASTA files, generating Jobs describing NUCmer command lines for each pairwise comparison. """ ncmds, fcmds = generate_nucmer_commands( - filenames, outdir, nucmer_exe, filter_exe, maxmatch + filenames, outdir, nucmer_exe, filter_exe, maxmatch, noextend ) joblist = [] for idx, ncmd in enumerate(ncmds): @@ -69,6 +72,7 @@ def generate_nucmer_commands( nucmer_exe=pyani_config.NUCMER_DEFAULT, filter_exe=pyani_config.FILTER_DEFAULT, maxmatch=False, + noextend=False, ): """Return a tuple of lists of NUCmer command-lines for ANIm @@ -81,6 +85,8 @@ def generate_nucmer_commands( - outdir - path to output directory - nucmer_exe - location of the nucmer binary - maxmatch - Boolean flag indicating to use NUCmer's -maxmatch option + - noextend - Boolean flag indicating whether to override extension of + NUCmer seed alignment (use --noextend flag) Loop over all FASTA files generating NUCmer command lines for each pairwise comparison. @@ -89,7 +95,7 @@ def generate_nucmer_commands( for idx, fname1 in enumerate(filenames[:-1]): for fname2 in filenames[idx + 1 :]: ncmd, dcmd = construct_nucmer_cmdline( - fname1, fname2, outdir, nucmer_exe, filter_exe, maxmatch + fname1, fname2, outdir, nucmer_exe, filter_exe, maxmatch, noextend ) nucmer_cmdlines.append(ncmd) delta_filter_cmdlines.append(dcmd) @@ -105,6 +111,7 @@ def construct_nucmer_cmdline( nucmer_exe=pyani_config.NUCMER_DEFAULT, filter_exe=pyani_config.FILTER_DEFAULT, maxmatch=False, + noextend=False, ): """Returns a tuple of NUCmer and delta-filter commands @@ -120,6 +127,8 @@ def construct_nucmer_cmdline( - outdir - path to output directory - maxmatch - Boolean flag indicating whether to use NUCmer's -maxmatch option. If not, the -mum option is used instead + - noextend - Boolean flag indicating whether to override extension of + NUCmer seed alignment (use --noextend flag) """ outsubdir = os.path.join(outdir, pyani_config.ALIGNDIR["ANIm"]) outprefix = os.path.join( @@ -130,10 +139,17 @@ def construct_nucmer_cmdline( os.path.splitext(os.path.split(fname2)[-1])[0], ), ) + + # Do we use maxmatch? if maxmatch: mode = "--maxmatch" else: mode = "--mum" + + # Do we extend seed alignments? + if noextend: + mode += " --noextend" + nucmercmd = "{0} {1} -p {2} {3} {4}".format( nucmer_exe, mode, outprefix, fname1, fname2 ) diff --git a/pyani/pyani_tools.py b/pyani/pyani_tools.py index 9c046005..f6333ff2 100644 --- a/pyani/pyani_tools.py +++ b/pyani/pyani_tools.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- # (c) The James Hutton Institute 2016-2019 -# (c) University of Strathclyde 2019-2020 +# (c) University of Strathclyde 2019-2021 # Author: Leighton Pritchard # # Contact: leighton.pritchard@strath.ac.uk @@ -16,7 +16,7 @@ # The MIT License # # Copyright (c) 2016-2019 The James Hutton Institute -# Copyright (c) 2019-2020 University of Strathclyde +# Copyright (c) 2019-2021 University of Strathclyde # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/tests/test_anim.py b/tests/test_anim.py index 0c843b6e..e19971fd 100644 --- a/tests/test_anim.py +++ b/tests/test_anim.py @@ -12,25 +12,24 @@ print() statements will be caught by nosetests unless there is an error. They can also be recovered with the -s option. -(c) The James Hutton Institute 2017 -Author: Leighton Pritchard - -Contact: -leighton.pritchard@hutton.ac.uk - -Leighton Pritchard, -Information and Computing Sciences, -James Hutton Institute, -Errol Road, -Invergowrie, -Dundee, -DD6 9LH, -Scotland, -UK +(c) The James Hutton Institute 2017-2019 +(c) University of Strathclyde 2019-2021 +# Author: Leighton Pritchard +# +# Contact: leighton.pritchard@strath.ac.uk +# +# Leighton Pritchard, +# Strathclyde Institute for Pharmacy and Biomedical Sciences, +# Cathedral Street, +# Glasgow, +# G4 0RE +# Scotland, +# UK The MIT License -Copyright (c) 2017 The James Hutton Institute +Copyright (c) 2017-2019 The James Hutton Institute +Copyright (c) University of Strathclyde 2019-2021 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -83,6 +82,14 @@ def setUp(self): "file1.fna file2.fna", ] ) + self.ntgtnoextend = " ".join( + [ + "nucmer --maxmatch --noextend -p", + "tests/test_output/anim/nucmer_output/file1_vs_file2", + "file1.fna file2.fna", + ] + ) + self.ftgt = " ".join( [ "delta_filter_wrapper.py delta-filter -1", @@ -164,6 +171,13 @@ def test_maxmatch_cmd_generation(self): ) assert_equal(ncmd, self.ntgtmax) + def test_noextend_cmd_generation(self): + """generate NUCmer command line with noextend.""" + ncmd, fcmd = anim.construct_nucmer_cmdline( + "file1.fna", "file2.fna", outdir=self.outdir, maxmatch=True, noextend=True + ) + assert_equal(ncmd, self.ntgtnoextend) + def test_multi_cmd_generation(self): """generate multiple abstract NUCmer/delta-filter command-lines.