Skip to content

Commit

Permalink
Merge pull request #355 from peterjc/sort_genomes
Browse files (browse the repository at this point in the history)
Sort genomes (for stable anim recovery mode)
  • Loading branch information
baileythegreen authored Nov 30, 2021
2 parents 4dd4f83 + 0a0f551 commit c0fdf6d
Show file tree
Hide file tree
Showing 5 changed files with 40 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pyani/anim.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ def construct_nucmer_cmdline(
outdir, called "nucmer_output".
"""
# Cast path strings to pathlib.Path for safety
fname1, fname2 = Path(fname1), Path(fname2)
fname1, fname2 = sorted([Path(fname1), Path(fname2)])

# Compile commands
# Nested output folders to avoid N^2 scaling in files-per-folder
Expand Down
10 changes: 5 additions & 5 deletions pyani/pyani_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,15 +74,15 @@ def get_fasta_paths(
:param dirname: Path, path to directory containing input FASTA files
:param extlist: List, file suffixes for FASTA files
Returns the full path to each file.
Returns sorted list of the full path to each file.
"""
# Lists are dangerous to have as default function arguments
extlist = extlist or [".fna", ".fa", ".fasta", ".fas"]
return [
return sorted(
fname
for fname in dirname.iterdir()
if fname.is_file() and fname.suffix in extlist
]
)


# Get a list of FASTA files and corresponding hashes from the input directory
Expand Down Expand Up @@ -116,12 +116,12 @@ def get_fasta_and_hash_paths(dirname: Path = Path(".")) -> List[Tuple[Path, Path

# Get list of FASTA files in a directory
def get_input_files(dirname: Path, *ext) -> List[Path]:
"""Return files in passed directory, filtered by extension.
"""Return sorted files in passed directory, filtered by extension.
:param dirname: Path, path to input directory
:param *ext: optional iterable of arguments describing permitted file extensions
"""
return [fname for fname in dirname.iterdir() if fname.suffix in ext]
return sorted(fname for fname in dirname.iterdir() if fname.suffix in ext)


# Get lengths of input sequences
Expand Down
10 changes: 10 additions & 0 deletions pyani/scripts/subcommands/subcmd_anim.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ def generate_joblist(
existingfiles = set(existingfiles) # Path objects hashable

joblist = [] # will hold ComparisonJob structs
jobs = {"new": 0, "old": 0} # will hold counts of new/old jobs for reporting
for idx, (query, subject) in enumerate(
tqdm(comparisons, disable=args.disable_tqdm)
):
Expand Down Expand Up @@ -350,13 +351,22 @@ def generate_joblist(
logger.debug("Recovering output from %s, not submitting job", outfname)
# Need to track the expected output, but set the job itself to None:
joblist.append(ComparisonJob(query, subject, dcmd, ncmd, outfname, None))
jobs["old"] += 1
else:
logger.debug("Building job")
# Build jobs
njob = pyani_jobs.Job("%s_%06d-n" % (args.jobprefix, idx), ncmd)
fjob = pyani_jobs.Job("%s_%06d-f" % (args.jobprefix, idx), dcmd)
fjob.add_dependency(njob)
joblist.append(ComparisonJob(query, subject, dcmd, ncmd, outfname, fjob))
jobs["new"] += 1
logger.info(
"Results not found for %d comparisons; %d new jobs built.",
jobs["new"],
jobs["new"],
)
if existingfiles:
logger.info("Retrieving results for %d previous comparisons.", jobs["old"])
return joblist


Expand Down
13 changes: 13 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@


# Convenience structs to emulate returned objects
class MockGenome(NamedTuple):
    """Mock genome object.

    Minimal stand-in that exposes only a ``path`` attribute.
    """

    # Path associated with the genome, kept as a plain string
    path: str


class MockProcess(NamedTuple):
"""Mock process object."""

Expand Down Expand Up @@ -267,6 +273,13 @@ def fragment_length():
return FRAGSIZE


@pytest.fixture
def unsorted_genomes(dir_anim_in):
    """Return two mock genomes deliberately listed out of name order.

    The tuple holds the genome named "second" before the one named "first",
    both located under ``dir_anim_in``. It is intended for checking the
    ordering of genome names in output file names for asymmetric analyses.
    """
    # MockGenome.path is a str, so build the paths from the string form
    indir = str(dir_anim_in)
    return (MockGenome(f"{indir}/second"), MockGenome(f"{indir}/first"))


@pytest.fixture
def mock_legacy_single_genome_dl(monkeypatch):
"""Mocks remote database calls for single-genome downloads.
Expand Down
11 changes: 11 additions & 0 deletions tests/test_anim.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,3 +250,14 @@ def test_mummer_job_generation(mummer_cmds_four):
assert job.name == "test_%06d-f" % idx # filter job name
assert len(job.dependencies) == 1 # has NUCmer job
assert job.dependencies[0].name == "test_%06d-n" % idx


def test_genome_sorting(tmp_path, unsorted_genomes):
    """Check that construct_nucmer_cmdline() orders its two input genomes.

    The genomes are passed as (second, first); the expected command lines
    name "first" before "second" in both the output prefix and the nucmer
    arguments.
    """
    # The fixture supplies the genomes out of order: "second" comes first
    second, first = (Path(genome.path) for genome in unsorted_genomes)
    outprefix = f"{tmp_path}/nucmer_output/{first.stem}/{first.stem}_vs_{second.stem}"
    expected = (
        f"nucmer --mum -p {outprefix} {first} {second}",
        f"delta_filter_wrapper.py delta-filter -1 {outprefix}.delta {outprefix}.filter",
    )
    nucmercmd, filtercmd = anim.construct_nucmer_cmdline(second, first, tmp_path)
    assert (nucmercmd, filtercmd) == expected

0 comments on commit c0fdf6d

Please sign in to comment.