Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

skani updates #44

Merged
merged 8 commits into from
Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions assemble/skani.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@ def _is_fasta_basically_empty(self, inFasta, min_length=500):
return False
return True

def _sort_skani_table_by_product(self, in_tsv, out_tsv):
''' Sort the skani output tsv by the product of ANI and Total_bases_covered
'''
with open(in_tsv, 'r') as inf:
reader = csv.DictReader(inf, delimiter='\t')
sorted_rows = sorted(reader, key=lambda row: float(row['ANI']) * float(row['Total_bases_covered']), reverse=True)

with open(out_tsv, 'w') as outf:
writer = csv.DictWriter(outf, fieldnames=reader.fieldnames, delimiter='\t', dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
writer.writerows(sorted_rows)

def execute(self, subcommand, args, outfile, threads=None):
''' generic execution of skani
'''
Expand Down Expand Up @@ -107,7 +119,7 @@ def dist(self, query_fasta, ref_fastas, outfile, other_args = (), threads=None):
self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads)

def find_reference_clusters(self, ref_fastas,
m=50, s=50, c=20, min_af=15,
m=15, s=50, c=10, min_af=15,
other_args = ('--no-learned-ani', '--robust', '--detailed', '--ci', '--sparse'),
threads=None):
''' use skani triangle to define clusters of highly-related genomes
Expand Down Expand Up @@ -136,8 +148,12 @@ def find_closest_reference(self, contigs_fasta, ref_fastas, out_file,
''' use skani dist to find the closest reference genome for each contig
(default settings here are for viral genomes)
'''
self.dist(contigs_fasta, ref_fastas, out_file,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)

with util.file.tempfname('.skani_dist.tsv') as tmp_tsv:
self.dist(contigs_fasta, ref_fastas, tmp_tsv,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)
self._sort_skani_table_by_product(tmp_tsv, out_file)

with open(out_file, 'r') as inf:
top_row = None
for row in csv.DictReader(inf, delimiter='\t'):
Expand Down
12 changes: 6 additions & 6 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gap
__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq))


def cluster_references_ani(inRefs, outClusters, m=50, s=50, c=20, min_af=15, threads=None):
def cluster_references_ani(inRefs, outClusters, m=15, s=50, c=10, min_af=15, threads=None):
''' This step uses the skani triangle tool to define clusters of highly-related genomes.
'''
skani = assemble.skani.SkaniTool()
Expand All @@ -391,9 +391,9 @@ def cluster_references_ani(inRefs, outClusters, m=50, s=50, c=20, min_af=15, thr
def parser_cluster_references_ani(parser=argparse.ArgumentParser(description='Cluster references')):
parser.add_argument('inRefs', nargs='+', help='FASTA files containing reference genomes')
parser.add_argument('outClusters', help='Output file containing clusters of highly-related genomes. Each line contains the filenames of the genomes in one cluster.')
parser.add_argument('-m', type=int, default=50, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-m', type=int, default=15, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-s', type=int, default=50, help='screen out pairs with < percent identity using k-mer sketching (default: %(default)s)')
parser.add_argument('-c', type=int, default=20, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('-c', type=int, default=10, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('--min_af', dest='min_af', type=int, default=15, help='minimum alignment fraction (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, cluster_references_ani, split_args=True)
Expand All @@ -402,7 +402,7 @@ def parser_cluster_references_ani(parser=argparse.ArgumentParser(description='Cl
__commands__.append(('cluster_references_ani', parser_cluster_references_ani))


def skani_contigs_to_refs(inContigs, inRefs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, m=50, s=50, c=20, min_af=15, threads=None):
def skani_contigs_to_refs(inContigs, inRefs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, m=15, s=50, c=10, min_af=15, threads=None):

skani = assemble.skani.SkaniTool()
clusters = skani.find_reference_clusters(inRefs, m=m, s=s, c=c, min_af=min_af, threads=threads)
Expand Down Expand Up @@ -436,9 +436,9 @@ def parser_skani_contigs_to_refs(parser=argparse.ArgumentParser(description='Fin
parser.add_argument('out_skani_dist', help='Output file containing ANI distances between contigs and references')
parser.add_argument('out_skani_dist_filtered', help='Output file containing ANI distances between contigs and references, with only the top reference hit per cluster')
parser.add_argument('out_clusters_filtered', help='Output file containing clusters of highly-related genomes, with only clusters that have a hit to the contigs')
parser.add_argument('-m', type=int, default=50, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-m', type=int, default=15, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-s', type=int, default=50, help='screen out pairs with < percent identity using k-mer sketching (default: %(default)s)')
parser.add_argument('-c', type=int, default=20, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('-c', type=int, default=10, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('--min_af', dest='min_af', type=int, default=15, help='minimum alignment fraction (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, skani_contigs_to_refs, split_args=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
>RVA_DQ473496.1_Rhinovirus_A49 (7106 bp)
TTAAAACTGGATCCAGGTTGTTCCCACCTGGATCTCCCAAAGGGAGTGGAACTCTGTTATTACGGTAACTTTGTACGCCA
GTTTTATCCTCCCCCACCCCATGTAACTTAGAAGCTTTTCACAAAGACCAATAGTTGGTAATCATCCAGATTACTTAAGG
TCAAGCACTTCTGTTTCCCCGGTCAACGTTGATATGCTCTAACAGGGCAAAAACAACTGCGATCGTTATCCGCAAAGCGC
CTACGCAAAGCCTAGTAACACCTTTGAAATTGTTTGGTTGGTCGTTCCGCCATTTTCCCTGGTAGACCTGGCAGATGAGG
CTAGAAACACCCCACTGGCGACAGTGTTCTAGCCTGCGTGGCTGCCTGCACACCCTGTGGGTGTGAAGCCAAATAATGGA
CAAGGTGTGAAGAGCCCCGTGTGCTCGCTTTGAGTCCTCCGGCCCCTGAATGCGGCTAACCTTAACCCTGCAGCTAGAGC
ACACAAGCCAGTGTGTATCTAGTCGTAATGAGTAATTGCGGGACGGGACCGACTACTTTGGGTGTCCGTGTTTCACTTTT
TACTTATAATTGCTTATGGTGACAATATATATATAGTATATATATTGACACCATGGGCGCACAGGTATCTAGGCAAAATG
TTGGAACTCATTCTACACAAAACTCCGTATCAAATGGATCTAGTTTGAACTATTTCAACATCAACTACTTTAAAGATGCT
GCTTCTAATGGCGCATCCAAATTGGAGTTTACTCAAGACCCTAGCAAATTTACAGACCCTGTTAAAGATGTATTAGAGAA
AGGAATACCAACACTACAATCACCCACAGTCGAGGCTTGTGGATATTCGGATAGAATCATACAAATCACTAGAGGAGATT
CAACAATAACTTCACAAGATGTGGCAAATGCTGTTGTTGCATATGGTGTTTGGCCACATTATCTATCCTCCAAAGATGCT
TCTGCAATTGACAAACCTTCTCGTCCGGATACATCTTCTAATAGATTTTATACTTTGAAGAGTGTGACTTGGACCAACTC
TTCAAAAGGTTGGTGGTGGAAATTGCCAGATGCACTGAGAGAAATGGGCATCTTTGGTGAAAACATGTTTTATCACTATT
TGGGCAGGAGTGGTTACACAATACATGTACAGTGCAATGCTAGCAAATTCCATCAAGGCACGTTAATCGTTGCACTAATA
CCTGAACACCAAATTGCAAGCGCCCTTCATGGTGATGTGAATGTTGGTTATAACTTTACCCATCCAGGAGAAACGGGAAG
GGAGGTCAAAAGTGACAGGAGCATGAACCTTAATTTGCAACCCACTGAAGAGTATTGGCTTAATTTTGATGGAACACTAC
TCGGAAATATTACTATATTTCCCCATCAATTTATTAACTTGAGAAGCAACAACTCTGCTACAATAATTGCTCCATATGTC
AATGCAGTCCCTATGGACTCAATGCGCAGCCACAACAACTGGAGTCTGGTGATAATCCCAATATGCCCTCTTGAAACATC
AAGTACCGTTAACACAATACCTATCACAATATCAATAAGTCCCATGTGTGCAGAGTTCTCTGGTGCGCGTGCTAAGAGTC
AGGGTTTACCGGTTTTCATCACGCCAGGGTCAGGCCAGTTTCTGACAACAGATGACTTTCAATCCCCATGTGCACTCCCT
TGGTATCACCCAACCAAAGAAATCTCTATTCCAGGTGAGGTAAAAAATTTAGTTGAAATATGTCAAGTAGATAGTTTAGT
GCCAATAAATAATACTGAAAATAACATCACTAATGAAAACATGTATTCGGTTGTGCTGCAATCAACAGTTGGAGAACCAG
ATAAAATCTTTTCTATCAAAACAGATGTAGCCTCTCAGCCATTAGCTACTACTCTAATTGGTGAAATATCCAGTTATTAT
ACTCACTGGACAGGAAGTCTCCGCTTCAGCTTCATGTTCTGTGGCACAGCCAATACCACTGTTAAACTCTTGTTAGCCTA
TACGCCACCTGGCATCGCAGAACCTACTACAAGAAAGGAAGCAATGTTAGGTACTCATGTAATATGGGATGTAGGTCTTC
AATCTACAATATCAATGGTGGTACCATGGATCAGTGCTAGTCACTACAGGAACACATCACCAGGTACATCTACATCTGGA
TACATAACATGCTGGTATCAAACTAGGCTAGTAATTCCACCTCAAACTCCAGCCACAGCTAGGTTATTATGTTTTGTATC
AGGCTGTAAAGATTTTTGTTTACGCATGGCTCGGGACACTAATTTGCATTTGCAGAGTGGGGTAATAGAACAAAATCCTG
TTGAACACTACATAGATGAGGTCCTTAATGAGGTTTTAGTTGTTCCAAATATTAATAGTAGTCACCCCACGACATCAAAC
TCTGCTCCAGCATTAGATGCTGCAGAAACAGGGCACACTAGCAATGTGCAACCTGAAGATGTCATTGAAACCAGATATGT
ACAGACATCACAAACAAGAGATGAAATGAGTTTAGAGAGTTTTCTTGGTAGGTCAGGGTGTATACATGAATCCAAACTAG
AGGTCACACTTACAAATTACAATGAAAATAATTTCAAAGTATGGAACATCAATTTGCAAGAAATGGCCCAGATCAGAAGA
AAATTTGAACTGTTTACTTACACTAGATTCGATTCTGAAATAACCTTGGTTCCATGCATTTCTGCACTTAGCAAGGATAT
TGGACACATTACAATGCAATACATGTATGTGCCGCCAGGTGCACCTGTACCAAAGAGCAGAGATGATTATGCATGGCAGT
CTGGCACTAATGCATCTGTTTTCTGGCAACATGGGCAAGCATACCCAAGATTTTCTCTACCCTTTTTGAGTGTAGCTTCA
GCTTACTACATGTTTTATGATGGATATAATGAACAGGGCCAAAATTATGGTACGGTAAGTACAAACAACATGGGATCATT
ATGCTCTAGGATAGTAACAGAGAAACACATTCACAGTATGCATATCATGACAAGAATCTATCATAAAGCTAAACACGTCA
AAGCATGGTGTCCGCGCCCACCCAGAGCACTTGAATATACTCGCGCTCACCGTACTAATTTCAAAGTTGAAGACAGAGAC
ATTAAAACAGGAATCACATCCAGAGCAATTATTACAACAGCTGGTCCTAGTGATATGTATGTTCATGTTGGTAATCTTAT
CTATAGAAATTTACATCTCTTCAATTCTGAAATGCATGAATCTATACTGGTATCTTATTCATCAGATTTAATCATTTACC
GAACAAACACCATAGGTGATGATTATATCCCCTCTTGTGATTGCACCCAGGCTACATATTACTGTAAACATAAAAATAGA
TATTTCCCAATCACAGTGACAAGCCATGACTGGTATGAAATACAGGAAAGTGAATACTACCCCAAGCATATACAATACAA
TTTACTAATTGGTGAGGGCCCTTGTGAACCAGGTGATTGTGGTGGGAAATTGTTATGTAAACATGGTGTTATTGGTATAG
TAACAGCTGGTGGTGATAATCATGTGGCTTTTATTGATCTTAGACACTTTCACTGTGCTGAGGAGCAAGGAATTACAGAT
TACATACACATGTTAGGTGAAGCATTTGGAAATGGATTTGTAGATAGTGTAAAAGAACATATACATGCCATTAATCCAGT
AGGAAATGTTAGTAAAAAGATCATCAAATGGATGCTGAGGATAATATCAGCAATGGTTATAATAATAAGGAATTCCTCTG
ATCCACAAACTATATTGGCAACACTCACGCTAATTGGATGTTCTGGTTCACCATGGAGATTTTTAAAGGAGAAATTTTGC
AAATGGACACAACTCAATTATATACATAAAGAATCAGACTCATGGTTGAAAAAATTTACTGAAGCATGCAATGCAGCCAG
AGGACTTGAGTGGATAGGGAATAAAATATCTAAATTCATTGAATGGATGAAATCAATGCTCCCACAAGCACAACTTAAGG
TCAAGTACTTAAATGAACTCAAAAAATTGAGCTTGTATGAAAAACAAGTTGAAAGTCTTAGAGTAGCAGATGTAAGAACA
CAAGAAAAGATTAAAATGGAAATTGATACCCTACATGACTTATCATGCAAATTTTTACCATTGTATGCAGGTGAAGCAAA
AAGGATAAAGACGTTACATATTAAGTGTGATAACATTATCAAACAAAAGAAAAGGTGTGAACCAGTGGCTATAGTTATCC
ACGGGCCACCTGGTGCTGGAAAATCAATAACAACAAATTTTTTAGCTAGGATGATAACTAATGATAGTGATATATACTCT
TTACCCCCAGATCCAAAATATTTTGATGGCTACGATCAACAAAGTGTGGTGATAATGGATGATATCATGCAAAATCCAGC
TGGGGATGACATGACTCTATTCTGTCAAATGGTTTCCAGTGTCACGTTCATACCACCAATGGCTGATTTACCAGATAAGG
GCAAGGCCTTTGACTCTAGGTTTGTATTATGCAGCACAAATCACTCTCTTTTAGCACCGCCTACAATAACCTCACTGCCT
GCTATGAACAGAAGATTCTTTTTAGACTTAGACATAATAGTACATGATAATTTCAAGGATTCACAGGGTAAACTTAATGT
AGCAGCAGCCTTCCAACCATGTAATGTGGACAATAAAATTGGGAATGCTCGTTGTTGCCCATTTGTGTGTGGAAAGGCTT
TTTCTTTCAAAGATCGCAATTCTTGCAATAAATACACCCTAGCACAAGTGTACAACATAATGCTTGAAGAAGACAAACGG
AGGAGACAAGTGGTTGACGTCATGTCGGCCATATTCCAAGGACCAATTGACATGAAAAACCCACCACCACCTGCAATTAC
TGATTTACTCCAGTCAGTTAGGACTCCTGAGGTTATCAAGTATTGTGAAGTTAATAGATGGATAATTCCAGCAGAATGTA
AAATAGAGAAGGAATTAAATTTAGCCAATACAATCATAACAATCATTGCCAATGTTATTAGTATAGCAGGTATTATATAC
GTCATCTACAAACTCTTCTGTACACTACAAGGACCATATTCAGGGGAACCTAAACCTAAAACTAAAGTCCCAGAAAGACG
TGTTGTGGCACAGGGGCCAGAGGAAGAATTTGGAATGTCTTTGATTAAGCACAACTCATGTGTGATTACAACAGAGAATG
GAAAGTTTACAGGCCTTGGGATATATGACAAGTTTGTGGTTGTACCAACACATGCAGATCCTGGGAAAGAAATTCAGGTT
GACGGCATAACTACAAAAGTTGCTGACTCATATGATTTATATAATAAAGATGGAATAAAGTTAGAAATAACAGTCTTAAA
ATTAGATAGGAATGAAAAGTTCAGAGACATTAGAAAATACATACCAAATAATGAGGATGACTACCCTGACTGCAACTTAG
CCTTGTTAGCAAATCAACCTGAACCCACTATAATCAATGTTGGAGATGTTATATCCTATGGCAACATACTGCTTAGTGGT
AACCAGACAGCTAGAATGCTCAAATACAGCTATCCAACTAAATCTGGGTACTGTGGGGGTGTTTTGTATAAAATTGGGCA
AGTGCTTGGAATACACGTAGGAGGAAATGGTAGAGATGGATTTTCAGCTATGTTGCTTAGATCTTACTTTACTGATGTGC
AAGGCCAAATAATCTCATCAAAGAAAACTAGTGAATGTAATCTTCCCAGTATACATAATCCATCCAAAACAAAATTGCAG
CCCAGTGTCTTCTATGATGTTTTCCCTGGTTCAAAAGAACCAGCTGTGTTGTCTGAGAAAGATACACGGCTTCAGGTTGA
CTTTAATGAAGCATTATTTTCTAAATACAAAGGAAACACAAACTGCTTTATGAATGATCACATAAGGATTGCATCTTCAC
ACTATGCAGCTCAATTAATCACTTTAGATATTGACCCAAGTCCTATCACCCTTGAAGATAGTGTTTTTGGTACTGATGGA
TTAGAGGCTCTTGACCTAAATACTAGTGCAGGATTTCCTTACATTACAATGGGGATAAAGAAGAGAGATTTAATAAATAA
CAAAACCAAAGACATAAGTAAACTTAGGGAAGCAATTGACAAGTATGGAGTTGACTTGCCTATGGTTACTTTCTTAAAAG
ATGAACTTAGAAAACATGAGAAGATAATCAAAGGCAAAACTAGAGTTATTGAAGCTAGTAGTGTGAATGATACTTTGCTG
TTCAGAACAACTTTTGGGAATCTTTTTTCAAAATTTCATCTAAATCCTGGAATTATCACTGGATCAGCAGTCGGGTGTGA
CCCAGAAACTTTCTGGTCAAAGATACCAGCAATGTTAGATGATAAGTGTATTATGGCTTTTGATTATACAAATTATGATG
GTAGTATACACCCTATTTGGTTTGAAGCCCTTAAACAAGTATTGATGGATTTATCCTTTGATCCAATGTTAATAGATAGA
TTATGCAAATCCAAACACATCTTTAGAAACACATACTATGAGGTTGAAGGAGGTGTTCCATCTGGATGTTCAGGTACTAG
TATTTTCAACACTATGATTAATAATATTATTATAAGGACCTTAGTGTTAGATGCATATAAAAATATAGATTTAGACAAAC
TTAAGATAATTGCCTATGGTGATGATGTTATATTTTCATACACACATGAATTGGACATGGAAGCTATAGCAATAGAGGGT
ATTAAATATGGTCTAACCATAACCCCTGCTGACAAATCTACCACTTTTGTAAAGTTAAATTACAATAATGTCACCTTTTT
GAAAAGAGGATTCAAACAAGATGAGAAATACAACTTCTTGATACACCCCACCTTTCCTGAGAGTGAAATATTTGAATCTA
TCAGGTGGACAAAGAAACCTTCACAGATGCACGAGCACGTACTATCTCTGTGTCGCTTAATGTGGCACAATGGGCGTGAT
GCGTATGAAAAGTTTGTGGAGAAGATACGCAGTGTGAGCGCTGGTCGCGCACTGTACATCCCTCCATATAACTTGCTTTT
GCATGAGTGGTATGAAAAATTTTAAACTGATATAGAAATATTAAACTGTTAGTTTATTAGTTTTAC
Loading