Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

skani-based reference selection tools #39

Merged
merged 14 commits into from
Mar 19, 2024
126 changes: 126 additions & 0 deletions assemble/skani.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
'''
SKANI - accurate, fast nucleotide identity calculation for MAGs and databases
https://github.com/bluenote-1577/skani
'''

__author__ = "[email protected]"

import logging
import tools
import util.file
import util.misc
import csv
import os
import os.path
import shutil
import subprocess

TOOL_NAME = "skani"

_log = logging.getLogger(__name__)

class UndirectedGraph:
''' Simple utility class for finding clusters from pairwise relationships
'''
def __init__(self):
self.edges = {}

def add_node(self, node):
self.edges.setdefault(node, set())

def add_edge(self, node1, node2):
self.edges.setdefault(node1, set()).add(node2)
self.edges.setdefault(node2, set()).add(node1)

def _dfs(self, node, visited):
visited.add(node)
cluster = set()
cluster.add(node)
for neighbor in self.edges[node]:
if neighbor not in visited:
cluster.update(self._dfs(neighbor, visited))
return cluster

def get_clusters(self):
visited = set()
clusters = []
for node in self.edges.keys():
if node not in visited:
clusters.append(self._dfs(node, visited))
return clusters


class SkaniTool(tools.Tool):

def __init__(self, install_methods=None):
if install_methods is None:
install_methods = [tools.PrexistingUnixCommand(shutil.which(TOOL_NAME), require_executability=True)]
super(SkaniTool, self).__init__(install_methods=install_methods)

def version(self):
if self.tool_version is None:
self._get_tool_version()
return self.tool_version

def _get_tool_version(self):
self.tool_version = subprocess.check_output([self.install_and_get_path(), '--version']).decode('UTF-8').strip().split()[1]

def execute(self, subcommand, args, outfile, threads=None):
''' generic execution of skani
'''

# build the skani command
tool_cmd = [self.install_and_get_path(), subcommand]
tool_cmd.extend(map(str, args))
tool_cmd.extend(["-t", "{}".format(util.misc.sanitize_thread_count(threads))])

# run the command
_log.debug(' '.join(tool_cmd) + ' > ' + outfile)
with open(outfile, 'w') as outf:
util.misc.run_and_save(tool_cmd, outf=outf)

def triangle(self, ref_fastas, outfile_ani, other_args = (), threads=None):
''' skani triangle computes an all-to-all ANI distance matrix for a set of sequences
'''
self.execute('triangle', list(ref_fastas) + list(other_args), outfile_ani, threads=threads)

def dist(self, query_fasta, ref_fastas, outfile, other_args = (), threads=None):
''' skani dist computes ANI distance between a specified query set of
sequences (MAGs) and reference genomes (database)
'''
self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads)

def find_reference_clusters(self, ref_fastas,
other_args = ('-m', 50, '--no-learned-ani', '--slow', '--robust', '--detailed', '--ci', '--sparse'),
threads=None):
''' use skani triangle to define clusters of highly-related genomes
(default settings here are for viral genomes)
'''
g = UndirectedGraph()
for ref_fasta in ref_fastas:
g.add_node(ref_fasta)

with util.file.tempfname('.skani_matrix.ani') as tmp_matrix_ani:
# run skani triangle
self.triangle(ref_fastas, tmp_matrix_ani, other_args, threads=threads)

# parse the skani triangle results and define clusters
with open(tmp_matrix_ani, 'r') as inf:
for row in csv.DictReader(inf, delimiter='\t'):
g.add_edge(row['Ref_file'], row['Query_file'])

return g.get_clusters()

def find_closest_reference(self, contigs_fasta, ref_fastas, out_file,
other_args = ('-m', 50, '--no-learned-ani', '--slow', '--robust', '--detailed', '--ci', '-s', 85, '-n', 10, '--no-marker-index'),
threads=None):
''' use skani dist to find the closest reference genome for each contig
(default settings here are for viral genomes)
'''
self.dist(contigs_fasta, ref_fastas, out_file, other_args, threads=threads)
with open(out_file, 'r') as inf:
top_row = None
for row in csv.DictReader(inf, delimiter='\t'):
top_row = row
break
return (top_row['Ref_file'], top_row['ANI'], top_row['Align_fraction_ref'], top_row['Total_bases_covered']) if top_row is not None else None
62 changes: 61 additions & 1 deletion assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import assemble.mummer
import assemble.muscle
import assemble.gap2seq
import assemble.skani

# third-party
import numpy
Expand Down Expand Up @@ -375,10 +376,69 @@ def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gap
util.cmd.attach_main(parser, gapfill_gap2seq, split_args=True)
return parser


__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq))


def cluster_references_ani(inRefs, outClusters, threads=None):
''' This step uses the skani triangle tool to define clusters of highly-related genomes.
'''
skani = assemble.skani.SkaniTool()
clusters = skani.find_reference_clusters(inRefs, threads=threads)
with open(outClusters, 'w') as outf:
for cluster in clusters:
outf.write('\t'.join(cluster) + '\n')

def parser_cluster_references_ani(parser=argparse.ArgumentParser(description='Cluster references')):
parser.add_argument('inRefs', nargs='+', help='FASTA files containing reference genomes')
parser.add_argument('outClusters', help='Output file containing clusters of highly-related genomes. Each line contains the filenames of the genomes in one cluster.')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, cluster_references_ani, split_args=True)
return parser

__commands__.append(('cluster_references_ani', parser_cluster_references_ani))


def skani_contigs_to_refs(inContigs, inRefs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, threads=None):

skani = assemble.skani.SkaniTool()
clusters = skani.find_reference_clusters(inRefs, threads=threads)
skani.find_closest_reference(inContigs, inRefs, out_skani_dist, threads=threads)
refs_hit = set()
refs_hit_by_cluster = set()

dist_header = util.file.readFlatFileHeader(out_skani_dist)
with open(out_skani_dist, 'r') as inf:
with open(out_skani_dist_filtered, 'w') as outf:
writer = csv.DictWriter(outf, dist_header, delimiter='\t', dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
for row in csv.DictReader(inf, delimiter='\t'):
refs_hit.add(row['Ref_file'])
if row['Ref_file'] not in refs_hit_by_cluster:
writer.writerow(row)
for cluster in clusters:
if row['Ref_file'] in cluster:
refs_hit_by_cluster.update(cluster)
break

with open(out_clusters_filtered, 'w') as outf:
for cluster in clusters:
hits = list([ref for ref in cluster if ref in refs_hit])
if hits:
outf.write('\t'.join(hits) + '\n')

def parser_skani_contigs_to_refs(parser=argparse.ArgumentParser(description='Find closest references for contigs')):
parser.add_argument('inContigs', help='FASTA file containing contigs')
parser.add_argument('inRefs', nargs='+', help='FASTA files containing reference genomes')
parser.add_argument('out_skani_dist', help='Output file containing ANI distances between contigs and references')
parser.add_argument('out_skani_dist_filtered', help='Output file containing ANI distances between contigs and references, with only the top reference hit per cluster')
parser.add_argument('out_clusters_filtered', help='Output file containing clusters of highly-related genomes, with only clusters that have a hit to the contigs')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, skani_contigs_to_refs, split_args=True)
return parser

__commands__.append(('skani_contigs_to_refs', parser_skani_contigs_to_refs))



def _order_and_orient_orig(inFasta, inReference, outFasta,
outAlternateContigs=None,
breaklen=None, # aligner='nucmer', circular=False, trimmed_contigs=None,
Expand Down
Loading