Skip to content

Commit

Permalink
Merge pull request #44 from broadinstitute/dp-scaffold
Browse files Browse the repository at this point in the history
skani updates
  • Loading branch information
dpark01 authored Mar 28, 2024
2 parents f88c8f9 + 3aa29ec commit 6178d2e
Show file tree
Hide file tree
Showing 14 changed files with 4,588 additions and 10 deletions.
22 changes: 19 additions & 3 deletions assemble/skani.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,18 @@ def _is_fasta_basically_empty(self, inFasta, min_length=500):
return False
return True

def _sort_skani_table_by_product(self, in_tsv, out_tsv):
''' Sort the skani output tsv by the product of ANI and Total_bases_covered
'''
with open(in_tsv, 'r') as inf:
reader = csv.DictReader(inf, delimiter='\t')
sorted_rows = sorted(reader, key=lambda row: float(row['ANI']) * float(row['Total_bases_covered']), reverse=True)

with open(out_tsv, 'w') as outf:
writer = csv.DictWriter(outf, fieldnames=reader.fieldnames, delimiter='\t', dialect=csv.unix_dialect, quoting=csv.QUOTE_MINIMAL)
writer.writeheader()
writer.writerows(sorted_rows)

def execute(self, subcommand, args, outfile, threads=None):
''' generic execution of skani
'''
Expand Down Expand Up @@ -107,7 +119,7 @@ def dist(self, query_fasta, ref_fastas, outfile, other_args = (), threads=None):
self.execute('dist', ['-q', query_fasta, '-r'] + list(ref_fastas) + list(other_args), outfile, threads=threads)

def find_reference_clusters(self, ref_fastas,
m=50, s=50, c=20, min_af=15,
m=15, s=50, c=10, min_af=15,
other_args = ('--no-learned-ani', '--robust', '--detailed', '--ci', '--sparse'),
threads=None):
''' use skani triangle to define clusters of highly-related genomes
Expand Down Expand Up @@ -136,8 +148,12 @@ def find_closest_reference(self, contigs_fasta, ref_fastas, out_file,
''' use skani dist to find the closest reference genome for each contig
(default settings here are for viral genomes)
'''
self.dist(contigs_fasta, ref_fastas, out_file,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)

with util.file.tempfname('.skani_dist.tsv') as tmp_tsv:
self.dist(contigs_fasta, ref_fastas, tmp_tsv,
['-m', m, '-c', c, '-s', s, '--min-af', min_af] + list(other_args), threads=threads)
self._sort_skani_table_by_product(tmp_tsv, out_file)

with open(out_file, 'r') as inf:
top_row = None
for row in csv.DictReader(inf, delimiter='\t'):
Expand Down
12 changes: 6 additions & 6 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ def parser_gapfill_gap2seq(parser=argparse.ArgumentParser(description='Close gap
__commands__.append(('gapfill_gap2seq', parser_gapfill_gap2seq))


def cluster_references_ani(inRefs, outClusters, m=50, s=50, c=20, min_af=15, threads=None):
def cluster_references_ani(inRefs, outClusters, m=15, s=50, c=10, min_af=15, threads=None):
''' This step uses the skani triangle tool to define clusters of highly-related genomes.
'''
skani = assemble.skani.SkaniTool()
Expand All @@ -391,9 +391,9 @@ def cluster_references_ani(inRefs, outClusters, m=50, s=50, c=20, min_af=15, thr
def parser_cluster_references_ani(parser=argparse.ArgumentParser(description='Cluster references')):
parser.add_argument('inRefs', nargs='+', help='FASTA files containing reference genomes')
parser.add_argument('outClusters', help='Output file containing clusters of highly-related genomes. Each line contains the filenames of the genomes in one cluster.')
parser.add_argument('-m', type=int, default=50, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-m', type=int, default=15, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-s', type=int, default=50, help='screen out pairs with < percent identity using k-mer sketching (default: %(default)s)')
parser.add_argument('-c', type=int, default=20, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('-c', type=int, default=10, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('--min_af', dest='min_af', type=int, default=15, help='minimum alignment fraction (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, cluster_references_ani, split_args=True)
Expand All @@ -402,7 +402,7 @@ def parser_cluster_references_ani(parser=argparse.ArgumentParser(description='Cl
__commands__.append(('cluster_references_ani', parser_cluster_references_ani))


def skani_contigs_to_refs(inContigs, inRefs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, m=50, s=50, c=20, min_af=15, threads=None):
def skani_contigs_to_refs(inContigs, inRefs, out_skani_dist, out_skani_dist_filtered, out_clusters_filtered, m=15, s=50, c=10, min_af=15, threads=None):

skani = assemble.skani.SkaniTool()
clusters = skani.find_reference_clusters(inRefs, m=m, s=s, c=c, min_af=min_af, threads=threads)
Expand Down Expand Up @@ -436,9 +436,9 @@ def parser_skani_contigs_to_refs(parser=argparse.ArgumentParser(description='Fin
parser.add_argument('out_skani_dist', help='Output file containing ANI distances between contigs and references')
parser.add_argument('out_skani_dist_filtered', help='Output file containing ANI distances between contigs and references, with only the top reference hit per cluster')
parser.add_argument('out_clusters_filtered', help='Output file containing clusters of highly-related genomes, with only clusters that have a hit to the contigs')
parser.add_argument('-m', type=int, default=50, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-m', type=int, default=15, help='marker k-mer compression factor (default: %(default)s)')
parser.add_argument('-s', type=int, default=50, help='screen out pairs with < percent identity using k-mer sketching (default: %(default)s)')
parser.add_argument('-c', type=int, default=20, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('-c', type=int, default=10, help='compression factor (k-mer subsampling ratio) (default: %(default)s)')
parser.add_argument('--min_af', dest='min_af', type=int, default=15, help='minimum alignment fraction (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, skani_contigs_to_refs, split_args=True)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
>RVA_DQ473496.1_Rhinovirus_A49 (7106 bp)
TTAAAACTGGATCCAGGTTGTTCCCACCTGGATCTCCCAAAGGGAGTGGAACTCTGTTATTACGGTAACTTTGTACGCCA
GTTTTATCCTCCCCCACCCCATGTAACTTAGAAGCTTTTCACAAAGACCAATAGTTGGTAATCATCCAGATTACTTAAGG
TCAAGCACTTCTGTTTCCCCGGTCAACGTTGATATGCTCTAACAGGGCAAAAACAACTGCGATCGTTATCCGCAAAGCGC
CTACGCAAAGCCTAGTAACACCTTTGAAATTGTTTGGTTGGTCGTTCCGCCATTTTCCCTGGTAGACCTGGCAGATGAGG
CTAGAAACACCCCACTGGCGACAGTGTTCTAGCCTGCGTGGCTGCCTGCACACCCTGTGGGTGTGAAGCCAAATAATGGA
CAAGGTGTGAAGAGCCCCGTGTGCTCGCTTTGAGTCCTCCGGCCCCTGAATGCGGCTAACCTTAACCCTGCAGCTAGAGC
ACACAAGCCAGTGTGTATCTAGTCGTAATGAGTAATTGCGGGACGGGACCGACTACTTTGGGTGTCCGTGTTTCACTTTT
TACTTATAATTGCTTATGGTGACAATATATATATAGTATATATATTGACACCATGGGCGCACAGGTATCTAGGCAAAATG
TTGGAACTCATTCTACACAAAACTCCGTATCAAATGGATCTAGTTTGAACTATTTCAACATCAACTACTTTAAAGATGCT
GCTTCTAATGGCGCATCCAAATTGGAGTTTACTCAAGACCCTAGCAAATTTACAGACCCTGTTAAAGATGTATTAGAGAA
AGGAATACCAACACTACAATCACCCACAGTCGAGGCTTGTGGATATTCGGATAGAATCATACAAATCACTAGAGGAGATT
CAACAATAACTTCACAAGATGTGGCAAATGCTGTTGTTGCATATGGTGTTTGGCCACATTATCTATCCTCCAAAGATGCT
TCTGCAATTGACAAACCTTCTCGTCCGGATACATCTTCTAATAGATTTTATACTTTGAAGAGTGTGACTTGGACCAACTC
TTCAAAAGGTTGGTGGTGGAAATTGCCAGATGCACTGAGAGAAATGGGCATCTTTGGTGAAAACATGTTTTATCACTATT
TGGGCAGGAGTGGTTACACAATACATGTACAGTGCAATGCTAGCAAATTCCATCAAGGCACGTTAATCGTTGCACTAATA
CCTGAACACCAAATTGCAAGCGCCCTTCATGGTGATGTGAATGTTGGTTATAACTTTACCCATCCAGGAGAAACGGGAAG
GGAGGTCAAAAGTGACAGGAGCATGAACCTTAATTTGCAACCCACTGAAGAGTATTGGCTTAATTTTGATGGAACACTAC
TCGGAAATATTACTATATTTCCCCATCAATTTATTAACTTGAGAAGCAACAACTCTGCTACAATAATTGCTCCATATGTC
AATGCAGTCCCTATGGACTCAATGCGCAGCCACAACAACTGGAGTCTGGTGATAATCCCAATATGCCCTCTTGAAACATC
AAGTACCGTTAACACAATACCTATCACAATATCAATAAGTCCCATGTGTGCAGAGTTCTCTGGTGCGCGTGCTAAGAGTC
AGGGTTTACCGGTTTTCATCACGCCAGGGTCAGGCCAGTTTCTGACAACAGATGACTTTCAATCCCCATGTGCACTCCCT
TGGTATCACCCAACCAAAGAAATCTCTATTCCAGGTGAGGTAAAAAATTTAGTTGAAATATGTCAAGTAGATAGTTTAGT
GCCAATAAATAATACTGAAAATAACATCACTAATGAAAACATGTATTCGGTTGTGCTGCAATCAACAGTTGGAGAACCAG
ATAAAATCTTTTCTATCAAAACAGATGTAGCCTCTCAGCCATTAGCTACTACTCTAATTGGTGAAATATCCAGTTATTAT
ACTCACTGGACAGGAAGTCTCCGCTTCAGCTTCATGTTCTGTGGCACAGCCAATACCACTGTTAAACTCTTGTTAGCCTA
TACGCCACCTGGCATCGCAGAACCTACTACAAGAAAGGAAGCAATGTTAGGTACTCATGTAATATGGGATGTAGGTCTTC
AATCTACAATATCAATGGTGGTACCATGGATCAGTGCTAGTCACTACAGGAACACATCACCAGGTACATCTACATCTGGA
TACATAACATGCTGGTATCAAACTAGGCTAGTAATTCCACCTCAAACTCCAGCCACAGCTAGGTTATTATGTTTTGTATC
AGGCTGTAAAGATTTTTGTTTACGCATGGCTCGGGACACTAATTTGCATTTGCAGAGTGGGGTAATAGAACAAAATCCTG
TTGAACACTACATAGATGAGGTCCTTAATGAGGTTTTAGTTGTTCCAAATATTAATAGTAGTCACCCCACGACATCAAAC
TCTGCTCCAGCATTAGATGCTGCAGAAACAGGGCACACTAGCAATGTGCAACCTGAAGATGTCATTGAAACCAGATATGT
ACAGACATCACAAACAAGAGATGAAATGAGTTTAGAGAGTTTTCTTGGTAGGTCAGGGTGTATACATGAATCCAAACTAG
AGGTCACACTTACAAATTACAATGAAAATAATTTCAAAGTATGGAACATCAATTTGCAAGAAATGGCCCAGATCAGAAGA
AAATTTGAACTGTTTACTTACACTAGATTCGATTCTGAAATAACCTTGGTTCCATGCATTTCTGCACTTAGCAAGGATAT
TGGACACATTACAATGCAATACATGTATGTGCCGCCAGGTGCACCTGTACCAAAGAGCAGAGATGATTATGCATGGCAGT
CTGGCACTAATGCATCTGTTTTCTGGCAACATGGGCAAGCATACCCAAGATTTTCTCTACCCTTTTTGAGTGTAGCTTCA
GCTTACTACATGTTTTATGATGGATATAATGAACAGGGCCAAAATTATGGTACGGTAAGTACAAACAACATGGGATCATT
ATGCTCTAGGATAGTAACAGAGAAACACATTCACAGTATGCATATCATGACAAGAATCTATCATAAAGCTAAACACGTCA
AAGCATGGTGTCCGCGCCCACCCAGAGCACTTGAATATACTCGCGCTCACCGTACTAATTTCAAAGTTGAAGACAGAGAC
ATTAAAACAGGAATCACATCCAGAGCAATTATTACAACAGCTGGTCCTAGTGATATGTATGTTCATGTTGGTAATCTTAT
CTATAGAAATTTACATCTCTTCAATTCTGAAATGCATGAATCTATACTGGTATCTTATTCATCAGATTTAATCATTTACC
GAACAAACACCATAGGTGATGATTATATCCCCTCTTGTGATTGCACCCAGGCTACATATTACTGTAAACATAAAAATAGA
TATTTCCCAATCACAGTGACAAGCCATGACTGGTATGAAATACAGGAAAGTGAATACTACCCCAAGCATATACAATACAA
TTTACTAATTGGTGAGGGCCCTTGTGAACCAGGTGATTGTGGTGGGAAATTGTTATGTAAACATGGTGTTATTGGTATAG
TAACAGCTGGTGGTGATAATCATGTGGCTTTTATTGATCTTAGACACTTTCACTGTGCTGAGGAGCAAGGAATTACAGAT
TACATACACATGTTAGGTGAAGCATTTGGAAATGGATTTGTAGATAGTGTAAAAGAACATATACATGCCATTAATCCAGT
AGGAAATGTTAGTAAAAAGATCATCAAATGGATGCTGAGGATAATATCAGCAATGGTTATAATAATAAGGAATTCCTCTG
ATCCACAAACTATATTGGCAACACTCACGCTAATTGGATGTTCTGGTTCACCATGGAGATTTTTAAAGGAGAAATTTTGC
AAATGGACACAACTCAATTATATACATAAAGAATCAGACTCATGGTTGAAAAAATTTACTGAAGCATGCAATGCAGCCAG
AGGACTTGAGTGGATAGGGAATAAAATATCTAAATTCATTGAATGGATGAAATCAATGCTCCCACAAGCACAACTTAAGG
TCAAGTACTTAAATGAACTCAAAAAATTGAGCTTGTATGAAAAACAAGTTGAAAGTCTTAGAGTAGCAGATGTAAGAACA
CAAGAAAAGATTAAAATGGAAATTGATACCCTACATGACTTATCATGCAAATTTTTACCATTGTATGCAGGTGAAGCAAA
AAGGATAAAGACGTTACATATTAAGTGTGATAACATTATCAAACAAAAGAAAAGGTGTGAACCAGTGGCTATAGTTATCC
ACGGGCCACCTGGTGCTGGAAAATCAATAACAACAAATTTTTTAGCTAGGATGATAACTAATGATAGTGATATATACTCT
TTACCCCCAGATCCAAAATATTTTGATGGCTACGATCAACAAAGTGTGGTGATAATGGATGATATCATGCAAAATCCAGC
TGGGGATGACATGACTCTATTCTGTCAAATGGTTTCCAGTGTCACGTTCATACCACCAATGGCTGATTTACCAGATAAGG
GCAAGGCCTTTGACTCTAGGTTTGTATTATGCAGCACAAATCACTCTCTTTTAGCACCGCCTACAATAACCTCACTGCCT
GCTATGAACAGAAGATTCTTTTTAGACTTAGACATAATAGTACATGATAATTTCAAGGATTCACAGGGTAAACTTAATGT
AGCAGCAGCCTTCCAACCATGTAATGTGGACAATAAAATTGGGAATGCTCGTTGTTGCCCATTTGTGTGTGGAAAGGCTT
TTTCTTTCAAAGATCGCAATTCTTGCAATAAATACACCCTAGCACAAGTGTACAACATAATGCTTGAAGAAGACAAACGG
AGGAGACAAGTGGTTGACGTCATGTCGGCCATATTCCAAGGACCAATTGACATGAAAAACCCACCACCACCTGCAATTAC
TGATTTACTCCAGTCAGTTAGGACTCCTGAGGTTATCAAGTATTGTGAAGTTAATAGATGGATAATTCCAGCAGAATGTA
AAATAGAGAAGGAATTAAATTTAGCCAATACAATCATAACAATCATTGCCAATGTTATTAGTATAGCAGGTATTATATAC
GTCATCTACAAACTCTTCTGTACACTACAAGGACCATATTCAGGGGAACCTAAACCTAAAACTAAAGTCCCAGAAAGACG
TGTTGTGGCACAGGGGCCAGAGGAAGAATTTGGAATGTCTTTGATTAAGCACAACTCATGTGTGATTACAACAGAGAATG
GAAAGTTTACAGGCCTTGGGATATATGACAAGTTTGTGGTTGTACCAACACATGCAGATCCTGGGAAAGAAATTCAGGTT
GACGGCATAACTACAAAAGTTGCTGACTCATATGATTTATATAATAAAGATGGAATAAAGTTAGAAATAACAGTCTTAAA
ATTAGATAGGAATGAAAAGTTCAGAGACATTAGAAAATACATACCAAATAATGAGGATGACTACCCTGACTGCAACTTAG
CCTTGTTAGCAAATCAACCTGAACCCACTATAATCAATGTTGGAGATGTTATATCCTATGGCAACATACTGCTTAGTGGT
AACCAGACAGCTAGAATGCTCAAATACAGCTATCCAACTAAATCTGGGTACTGTGGGGGTGTTTTGTATAAAATTGGGCA
AGTGCTTGGAATACACGTAGGAGGAAATGGTAGAGATGGATTTTCAGCTATGTTGCTTAGATCTTACTTTACTGATGTGC
AAGGCCAAATAATCTCATCAAAGAAAACTAGTGAATGTAATCTTCCCAGTATACATAATCCATCCAAAACAAAATTGCAG
CCCAGTGTCTTCTATGATGTTTTCCCTGGTTCAAAAGAACCAGCTGTGTTGTCTGAGAAAGATACACGGCTTCAGGTTGA
CTTTAATGAAGCATTATTTTCTAAATACAAAGGAAACACAAACTGCTTTATGAATGATCACATAAGGATTGCATCTTCAC
ACTATGCAGCTCAATTAATCACTTTAGATATTGACCCAAGTCCTATCACCCTTGAAGATAGTGTTTTTGGTACTGATGGA
TTAGAGGCTCTTGACCTAAATACTAGTGCAGGATTTCCTTACATTACAATGGGGATAAAGAAGAGAGATTTAATAAATAA
CAAAACCAAAGACATAAGTAAACTTAGGGAAGCAATTGACAAGTATGGAGTTGACTTGCCTATGGTTACTTTCTTAAAAG
ATGAACTTAGAAAACATGAGAAGATAATCAAAGGCAAAACTAGAGTTATTGAAGCTAGTAGTGTGAATGATACTTTGCTG
TTCAGAACAACTTTTGGGAATCTTTTTTCAAAATTTCATCTAAATCCTGGAATTATCACTGGATCAGCAGTCGGGTGTGA
CCCAGAAACTTTCTGGTCAAAGATACCAGCAATGTTAGATGATAAGTGTATTATGGCTTTTGATTATACAAATTATGATG
GTAGTATACACCCTATTTGGTTTGAAGCCCTTAAACAAGTATTGATGGATTTATCCTTTGATCCAATGTTAATAGATAGA
TTATGCAAATCCAAACACATCTTTAGAAACACATACTATGAGGTTGAAGGAGGTGTTCCATCTGGATGTTCAGGTACTAG
TATTTTCAACACTATGATTAATAATATTATTATAAGGACCTTAGTGTTAGATGCATATAAAAATATAGATTTAGACAAAC
TTAAGATAATTGCCTATGGTGATGATGTTATATTTTCATACACACATGAATTGGACATGGAAGCTATAGCAATAGAGGGT
ATTAAATATGGTCTAACCATAACCCCTGCTGACAAATCTACCACTTTTGTAAAGTTAAATTACAATAATGTCACCTTTTT
GAAAAGAGGATTCAAACAAGATGAGAAATACAACTTCTTGATACACCCCACCTTTCCTGAGAGTGAAATATTTGAATCTA
TCAGGTGGACAAAGAAACCTTCACAGATGCACGAGCACGTACTATCTCTGTGTCGCTTAATGTGGCACAATGGGCGTGAT
GCGTATGAAAAGTTTGTGGAGAAGATACGCAGTGTGAGCGCTGGTCGCGCACTGTACATCCCTCCATATAACTTGCTTTT
GCATGAGTGGTATGAAAAATTTTAAACTGATATAGAAATATTAAACTGTTAGTTTATTAGTTTTAC
Loading

0 comments on commit 6178d2e

Please sign in to comment.