Skip to content

Commit

Permalink
add assembly_summary that triggered the problems
Browse files Browse the repository at this point in the history
  • Loading branch information
luizirber committed Aug 14, 2020
1 parent ca85969 commit 5825f3f
Show file tree
Hide file tree
Showing 2 changed files with 65 additions and 8 deletions.
73 changes: 65 additions & 8 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,39 @@ from pathlib import Path
SBT_ZIP = Path("/home/luizirber/work/sourmash-bio/sourmash_resources_debug/genbank-bacteria-x1e6-k51.sbt.zip")

rule all:
input: "outputs/newtree.sbt.zip"
input:
"outputs/signames_new_catalog.txt",
"outputs/signames_new.txt"

rule extract_sig_names:
output: "outputs/signames.txt"
input: SBT_ZIP
run:
def extract(inp, out):
import sourmash

index = sourmash.load_file_as_index(input[0])
with open(output[0], 'w') as out:
index = sourmash.load_file_as_index(inp)
with open(out, 'w') as out:
for leaf in index.leaves():
sig = leaf.data
out.write(sig.name() + "\n")
leaf.unload()

rule build_sbt:
rule extract_sig_names:
output: "outputs/signames.txt"
input: SBT_ZIP
run:
extract(input[0], output[0])

rule extract_sig_names_new_sbt:
output: "outputs/signames_new.txt"
input: "outputs/newtree.sbt.zip"
run:
extract(input[0], output[0])

rule extract_sig_names_new_sbt_catalog:
output: "outputs/signames_new_catalog.txt"
input: "outputs/newtree_catalog.sbt.zip"
run:
extract(input[0], output[0])

rule build_sbt_from_set:
output: "outputs/newtree.sbt.zip"
input: "outputs/signames.txt"
run:
Expand All @@ -38,3 +55,43 @@ rule build_sbt:
print(f"processed {i} sigs")

index.save(output[0])

rule build_sbt_from_catalog:
output: "outputs/newtree_catalog.sbt.zip"
input: "outputs/catalog.txt"
run:
import sourmash

index = sourmash.create_sbt_index(bloom_filter_size=64)
signames = []
with open(input[0], 'r') as f:
for line in f:
signames.append(line.strip())

for i, signame in enumerate(signames):
mh = sourmash.MinHash(0, 31, scaled=2000, mins=[i])
new_sig = sourmash.SourmashSignature(mh, name=signame)
index.insert(new_sig)
if i % 1000 == 0:
print(f"processed {i} sigs")

index.save(output[0])

rule generate_catalog:
output: "outputs/catalog.txt"
input: "outputs/assembly_summary_genbank.txt.gz"
run:
import csv
import gzip

with open(output[0], 'w') as out:
with gzip.open(input[0], 'rt') as fp:
fp.readline() # skip first line
fp.read(2) # skip initial comment in header
data = csv.DictReader(fp, delimiter='\t')
for row in data:
name_parts = [row["assembly_accession"], " ", row['organism_name']]
if row['infraspecific_name']:
name_parts += [" ", row['infraspecific_name']]
name_parts += [', ', row['asm_name']]
out.write("".join(name_parts) + "\n")
Binary file added outputs/assembly_summary_genbank.txt.gz
Binary file not shown.

0 comments on commit 5825f3f

Please sign in to comment.