diff --git a/db/default_template.docx b/db/default_template.docx index 272ea81..6ad3ba9 100644 Binary files a/db/default_template.docx and b/db/default_template.docx differ diff --git a/tb-profiler b/tb-profiler index f1d421a..883b8a7 100644 --- a/tb-profiler +++ b/tb-profiler @@ -1,7 +1,7 @@ #! /usr/bin/env python3 import sys import pathogenprofiler as pp -from pathogenprofiler import TempFilePrefix +from pathogenprofiler import TempFilePrefix, TempFolder import pathogenprofiler.variant_calling as vc import argparse from rich_argparse import ArgumentDefaultsRichHelpFormatter @@ -17,6 +17,7 @@ from joblib import Parallel, delayed from tqdm import tqdm import logging from rich.logging import RichHandler +from copy import deepcopy import importlib import pkgutil @@ -81,18 +82,28 @@ contents of the error log (%s) ############################################################################### """ % (outfile)) +def vcf_job(args: argparse.Namespace,sample_name: str): + # logging.info(f"\nExtracting variants and running pipeline for {sample_name}\n") + copy_of_args = deepcopy(args) + copy_of_args.prefix = sample_name + with TempFolder() as tmpfolder: + cmd = f"bcftools view -s {sample_name} -ac 1 {args.vcf} | bcftools +fixploidy -Oz -o {tmpfolder}/{args.vcf} && bcftools index {tmpfolder}/{args.vcf} " % vars(copy_of_args) + pp.run_cmd(cmd) + os.chdir(tmpfolder) + copy_of_args.files_prefix = os.path.abspath(f"{os.getcwd()}/{copy_of_args.prefix}") + main_profile(copy_of_args) + os.chdir('../') def multisample_vcf_run(args): vcf_obj = pp.Vcf(args.vcf) - args.original_vcf = args.vcf + jobs = [] + for sample_name in vcf_obj.samples: - logging.info(f"\nExtracting variants and running pipeline for {sample_name}\n") - args.prefix = sample_name - args.tmp_vcf = "%s.%s.vcf.gz" % (args.files_prefix,uuid4()) - pp.run_cmd("bcftools view -s %(prefix)s -ac 1 %(original_vcf)s | bcftools +fixploidy -Oz -o %(tmp_vcf)s && bcftools index %(tmp_vcf)s " % vars(args)) - args.vcf = args.tmp_vcf - main_profile(args) + jobs.append((args,sample_name)) + + parallel = Parallel(n_jobs=args.threads, return_as='generator') + [r for r in tqdm(parallel(delayed(vcf_job)(cmd[0],cmd[1]) for cmd in jobs),total=len(jobs),desc="Running jobs")] def create_output_directories(args,directories=["bam","vcf","results"]): if pp.nofolder(args.dir): diff --git a/tbprofiler/__init__.py b/tbprofiler/__init__.py index 581af2b..f78cb59 100644 --- a/tbprofiler/__init__.py +++ b/tbprofiler/__init__.py @@ -8,7 +8,7 @@ from .docx import * from abc import ABC, abstractmethod -__version__ = "6.4.1" +__version__ = "6.5.0" class ProfilePlugin: diff --git a/tbprofiler/consensus.py b/tbprofiler/consensus.py index 27c6844..2bc8fc1 100644 --- a/tbprofiler/consensus.py +++ b/tbprofiler/consensus.py @@ -70,7 +70,8 @@ def get_consensus_vcf(sample: str,input_vcf: str,args: argparse.Namespace) -> st tmp_aln = str(uuid4()) run_cmd(f"cat {args.conf['ref']} {consensus_file}> {tmp_aln}") outfile = f"{args.files_prefix}.masked.vcf" - run_cmd(f"faToVcf -includeNoAltN {tmp_aln} {outfile}") - os.remove(tmp_aln) + run_cmd(f"fa2vcf.py {tmp_aln} {outfile}") + run_cmd(f'rm {tmp_aln} {tmp_aln}.fai') + return outfile diff --git a/tbprofiler/snp_dists.py b/tbprofiler/snp_dists.py index 3ec8f83..9c529fe 100644 --- a/tbprofiler/snp_dists.py +++ b/tbprofiler/snp_dists.py @@ -12,6 +12,7 @@ import argparse from .models import ProfileResult, LinkedSample from typing import List, Tuple +from datetime import datetime def extract_variant_set(vcf_file: str) -> Tuple[set,set]: ref_diffs = set() @@ -74,6 +75,7 @@ def store(self,result: ProfileResult, vcf_file: str) -> None: self.missing = missing def search(self,result: ProfileResult, vcf_file: str, cutoff: int = 20) -> List[LinkedSample]: logging.info("Searching for close samples in %s" % self.filename) + start_timestamp = datetime.now() self.c.execute("SELECT sample, diffs, missing FROM variants WHERE lineage=?",(result.sub_lineage,)) self.diffs,self.missing = extract_variant_set(vcf_file) sample_dists = [] @@ -89,6 +91,8 @@ def search(self,result: ProfileResult, vcf_file: str, cutoff: int = 20) -> List[ positions = list(dist) ) ) + end_timestamp = datetime.now() + logging.info("Finished searching for close samples in %s. Took %s" % (self.filename,end_timestamp-start_timestamp)) logging.info("Found %s close samples" % len(sample_dists)) return sample_dists