From e0f38faae42309599f3e294900183ca1071a30f3 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:47:30 -0400 Subject: [PATCH 01/40] Bump conda version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5aa14bc..2bfe514 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name="elgato", - version="1.11.1", + version="1.12.0", python_requires='>=3.8', scripts = [ 'el_gato/el_gato.py', From 38ef1883b629840673182a59f19e283ae069165d Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:48:00 -0400 Subject: [PATCH 02/40] Output files show SBT order --- el_gato/el_gato.py | 105 ++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/el_gato/el_gato.py b/el_gato/el_gato.py index 45bdf6c..ddfa1d2 100644 --- a/el_gato/el_gato.py +++ b/el_gato/el_gato.py @@ -11,7 +11,7 @@ import shlex import time import math -from collections import defaultdict, Counter +from collections import defaultdict, Counter, OrderedDict t0 = time.time() script_filename = inspect.getframeinfo(inspect.currentframe()).filename script_path = os.path.dirname(os.path.abspath(script_filename)) @@ -32,18 +32,30 @@ class Ref: mompS_primer2 = "TTGACCATGAGTGGGATTGG\tCAGAAGCTGCGAAATCAG" prereq_programs = ["minimap2", "samtools", "makeblastdb", "blastn", "isPcr"] REF_POSITIONS = { - "asd": { + "flaA": { 'start_pos' : 351, - 'end_pos' : 823, + 'end_pos' : 532, }, - "flaA": { + "pilE": { 'start_pos' : 351, - 'end_pos' : 532, + 'end_pos' : 683, + }, + "asd": { + 'start_pos' : 351, + 'end_pos' : 823, }, "mip": { 'start_pos' : 350, 'end_pos' : 751, }, + "mompS": { + 'start_pos' : 367, + 'end_pos' : 718, + }, + "proA": { + 'start_pos' : 350, + 'end_pos' : 754, + }, "neuA": { 'start_pos' : 350, 'end_pos' : 703, @@ -64,18 +76,6 @@ class Ref: 'start_pos' : 350, 'end_pos' : 700, }, - "pilE": { - 'start_pos' : 351, - 'end_pos' : 683, - }, - "proA": { - 'start_pos' : 350, - 'end_pos' : 754, - }, - "mompS": { - 'start_pos' : 367, - 'end_pos' : 718, - }, } class SAM_data(object): @@ -197,7 +197,7 @@ def get_args() -> argparse.ArgumentParser: group2.add_argument("--help", "-h", action="help", help="Show this help message and exit") group2.add_argument("--threads", "-t", help="Number of threads to run the programs (default: %(default)s)", type=int, required=False, default=1) - group2.add_argument("--mincov", "-c", help="Specify the minimum coverage used to identify loci in paired-end reads (default: %(default)s)", type=int, required=False, + group2.add_argument("--depth", "-d", help="Specify the minimum depth used to identify loci in paired-end reads (default: %(default)s)", type=int, required=False, default=10) group2.add_argument("--out", "-o", help="Output folder name (default: %(default)s)", type=str, required=False, default="out") @@ -385,7 +385,7 @@ def set_inputs( inputs["profile"] = args.profile inputs["verbose"] = args.verbose inputs["overwrite"] = args.overwrite - inputs["mincov"] = args.mincov + inputs["depth"] = args.depth inputs["header"] = args.header Ref.file = os.path.join(inputs["out_prefix"], Ref.file) if args.sample == inputs["sample_name"]: @@ -643,6 +643,8 @@ def run_command(command: str, """ logging.debug(f"Running command: {command}") full_command = command + + if tool is not None: logging.info(f"Running {tool}") if not shell: @@ -661,6 +663,22 @@ def run_command(command: str, 
logging.critical(f"CRITICAL ERROR! The following command had an improper exit: \n{full_command}\n") sys.exit(1) +# Sort BLAST output in SBT order + if tool == "blast": + loci = ['flaA', 'pilE', 'asd', 'mip', 'mompS', 'proA', 'neuA_neuAH'] + al = [] + res_string = '' + for line in result.splitlines(): + line = line.split() + al.append(line) + res = [loc for x in loci for loc in al if x in loc[1]] + res = [" ".join(i) for i in res] + for i in res: + res_string += str(i) + res_string += '\n' + res_string = res_string.replace(" ", "\t") + result = res_string + if log_output: pretty_result = prettify("\n".join([column_headers, result])) if tool is not None: @@ -750,6 +768,7 @@ def blast_momps_allele(inputs: dict, seq: str, db: str) -> list: a.allele_id = "-" return [a] + def call_momps_pcr(inputs: dict, assembly_file: str) -> list: @@ -882,8 +901,10 @@ def blast_remaining_loci(inputs: dict, assembly_file: str, ref: Ref, momps: bool """ loci = ["flaA", "pilE", "asd", "mip", "proA", "neuA_neuAH"] if momps: - loci.append("mompS") + loci.insert(4, "mompS") calls = {k:[] for k in loci} + al = [] + res_string = '' assembly_dict = fasta_to_dict(assembly_file) @@ -891,10 +912,21 @@ def blast_remaining_loci(inputs: dict, assembly_file: str, ref: Ref, momps: bool desc_header = "Best match of each locus in provided assembly using BLASTN." column_headers = "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqlen\tslen" result = run_command(blast_command, tool='blast', shell=True, log_output=False) - + result = filter_blast_hits(result, momps=momps, len_thresh=inputs['length'], pcnt_id_thresh=inputs['sequence']) # Now do the logging of the good blast hits + + for line in result.splitlines(): + line = line.split() + al.append(line) + res = [loc for x in loci for loc in al if x in loc[1]] + res = [" ".join(i) for i in res] + for i in res: + res_string += str(i) + res_string += '\n' + res_string = res_string.replace(" ", "\t") + result = res_string pretty_result = prettify("\n".join([column_headers, result])) logging.debug(f"Command log for blast:\n{pretty_result}") @@ -918,7 +950,7 @@ def blast_remaining_loci(inputs: dict, assembly_file: str, ref: Ref, momps: bool if float(bits[2]) == 100.00 and bits[3] == bits[13]: a.allele_id = bits[1].split("_")[-1] elif bits[3] == bits[13]: - a.allele_id = bits[1].split("_")[-1]+"*" + a.allele_id = "NAT" else: error_msg = f"The sequence of locus {locus} did not return a full length match in the database\n" logging.info(error_msg) @@ -950,6 +982,7 @@ def blast_remaining_loci(inputs: dict, assembly_file: str, ref: Ref, momps: bool calls[locus].append(a) not_found_loci = [k for k,v in calls.items() if v ==[]] + if len(not_found_loci) != 0: error_msg = f"The following loci were not found in your assembly: {', '.join(not_found_loci)}\n" @@ -967,8 +1000,7 @@ def blast_remaining_loci(inputs: dict, assembly_file: str, ref: Ref, momps: bool logging.info(error_msg) with open(f"{inputs['out_prefix']}/intermediate_outputs.txt", 'a') as f: f.write(error_msg + '\n') - - + return calls def get_st(allele_profile: str, Ref: Ref, profile_file: str) -> str: @@ -1113,6 +1145,7 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str alleles = {} cov_results = {} + loci = ['flaA', 'pilE', 'asd', 'mip', 'mompS', 'proA', 'neuA'] for line in result.strip().split('\n')[1:]: gene, _, _, cov, depth, _, _ = line.split() @@ -1139,6 +1172,7 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str a = 
Allele() a.allele_id = '-' alleles['neuA_neuAH'] = [a] + neuA_catch = '' if len([i for i in ref.REF_POSITIONS.keys() if 'neuA' in i]) > 1: cov_sorted = sorted( @@ -1153,8 +1187,8 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str del ref.REF_POSITIONS[gene[0]] else: logging.info("Analysis of read mapping to neuA locus variants was unsuccessful. Unclear which to use.") - - + + #ref.REF_POSITIONS = OrderedDict([(x, ref.REF_POSITIONS[x]) for loci in x]) for locus in ref.REF_POSITIONS: locus_reads = contig_dict[locus] @@ -1208,8 +1242,8 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str ) min_cov = min(cov) - if min_cov < inputs['mincov']: - msg = f"WARNING: After applying a quality cutoff of 20 to basecalls, at least one position in {locus.split('_')[0]} has below {inputs['mincov']} coverage and can't be resolved" + if min_cov < inputs['depth']: + msg = f"WARNING: After applying a quality cutoff of 20 to basecalls, at least one position in {locus.split('_')[0]} has below {inputs['depth']} depth and can't be resolved" logging.info(msg) cov_msg += f"\n{msg}\n\n" a = Allele() @@ -1217,8 +1251,8 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str alleles[locus] = [a] continue - cov_msg += f"minimum coverage of {locus} locus is {min_cov}." + '\n' - logging.info(f"minimum coverage of {locus} locus is {min_cov}.") + cov_msg += f"minimum depth of {locus} locus is {min_cov}." + '\n' + logging.info(f"minimum depth of {locus} locus is {min_cov}.") num_alleles_per_site = [len(i) for i in seq] @@ -1318,8 +1352,8 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str else: allele.seq += allele.basecalls[biallic_count] if len(base) > 1: - biallic_count+=1 - + biallic_count+=1 + with open(f"{outdir}/intermediate_outputs.txt", 'a') as f: f.write(cov_msg + "\n\n") @@ -1348,6 +1382,8 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str def write_alleles_to_file(alleles: list, outdir: str): identified_allele_fasta_string = "" + loci = ['flaA', 'pilE', 'asd', 'mip', 'mompS', 'proA', 'neuA_neuAH'] + alleles = OrderedDict([(a, alleles[a]) for a in loci]) for locus, l in alleles.items(): if len(l) > 1: for n, allele in enumerate(l): @@ -1406,6 +1442,7 @@ def map_alleles(inputs: dict, ref: Ref): column_headers = "qseqid\tsseqid\tpident\tlength\tmismatch\tgapopen\tqstart\tqend\tsstart\tsend\tevalue\tbitscore\tqlen\tslen" result = run_command(blast_command, tool='blast', shell=True, desc_file=f"{outdir}/intermediate_outputs.txt", desc_header=desc_header, column_headers=column_headers) + if len(result) == 0: logging.info("WARNING: No allele matches found in the database. 
Can't resolve any alleles!") with open(f"{outdir}/intermediate_outputs.txt", "a") as fout: @@ -1423,7 +1460,7 @@ def map_alleles(inputs: dict, ref: Ref): for allele_list in alleles.values(): for allele in allele_list: if bits[0] == allele.fasta_header: - allele.allele_id = bits[1].split("_")[-1]+"*" + allele.allele_id = "NAT" if len(alleles['mompS']) == 1: logging.info("1 mompS allele identified.") @@ -1674,7 +1711,7 @@ def main(): 'profile' : os.path.join(os.path.dirname(__file__), "db", "lpneumophila.txt"), 'verbose' : False, 'overwrite' : False, - 'depth' : 3, + 'depth' : 10, 'analysis_path' : "", 'logging_buffer_message' : "", 'header' : True, From a17cc254e66c9a236e6dce57ea44f0286b1ded20 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:48:22 -0400 Subject: [PATCH 03/40] Bump conda version --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 8daf49f..3ba78e1 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "elgato" %} -{% set version = "1.11.1" %} +{% set version = "1.12.0" %} package: name: {{ name|lower }} From c639daf35a4117e5d2aa1039932758e454d41384 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:51:31 -0400 Subject: [PATCH 04/40] Update depth flag --- run_el_gato.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_el_gato.nf b/run_el_gato.nf index 0dce008..7c24e15 100644 --- a/run_el_gato.nf +++ b/run_el_gato.nf @@ -4,7 +4,7 @@ nextflow.enable.dsl=2 params.reads_dir = false params.assembly_dir = false params.threads = 1 -params.mincov = 10 +params.depth = 10 params.length = 0.3 params.sequence = 95.0 params.out = 'el_gato_out' @@ -35,7 +35,7 @@ process RUN_EL_GATO_READS { -2 $r2 \ -o ${sampleId}_out \ -t ${task.cpus} \ - -c $params.mincov \ + -d $params.depth \ -w > mlst.txt mv mlst.txt ${sampleId}_out/ From 2103ea311f826b2c068fc56ea1dd370afe9335de Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:33:39 -0400 Subject: [PATCH 05/40] Update README.md --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 12d24d9..fc7d37e 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # el_gato **E**pidemiology of ***L**egionella* : **G**enome-b**A**sed **T**yping: -[Add two sentence summary of what el gato is] +El_gato is a Bioinformatics tool that utilizes either a genome assembly (.fasta) or Illumina paired-end reads (.fastq) to replicate *Legionella pneumophila* Sequence Based Typing (SBT). From the input, 7 loci (*flaA*, *pilE*, *asd*, *mip*, *mompS*, *proA*, *neuA/neuAh*) are identified and compared to a database of sequence types. The sequence type provided for each input sample is based on the unique combination of the allelic identities of the 7 target loci. 
* [Installation](#installation) * [Method 1: using conda](#method-1-using-conda) @@ -96,7 +96,7 @@ Optional arguments: --threads THREADS -t THREADS Number of threads to run the programs (default: 1) --depth DEPTH -d DEPTH - Variant read depth cutoff (default: 3) + Variant read depth cutoff (default: 10) --out OUT -o OUT Output folder name (default: out) --sample SAMPLE -n SAMPLE @@ -155,9 +155,13 @@ The corresponding allele number is reported for each gene if an exact allele mat | Symbol | Meaning | |:------:|:---------| +|Novel ST | Novel Sequence Type: All 7 target genes were found, but not present in the profile - most likely a novel sequence type. | +|Novel ST* | Novel Sequence Type due to novel allele: One or multiple target genes have a novel allele found. | +|MD- | Missing Data: ST is unidentifiable as a result of or more of the target genes that are unidentifiable. | +|MA? | Multiple Alleles: ST is ambiguous due to multiple alleles that could not be resolved. | | NAT | Novel Allele Type: BLAST cannot find an exact allele match - most likely a new allele. | | - | Missing Data: Both percent and length identities are too low to return a match or N's in sequence. | -| ? | Multiple alleles: More than one allele was found and could not be resolved. | +| ? | Multiple Alleles: More than one allele was found and could not be resolved. | If symbols are present in the ST profile, the other output files produced by el_gato will provide additional information to understand what is being communicated. @@ -179,7 +183,7 @@ Headers are included in outputs for the samtools coverage command and blast resu |:-------------:|:---------------------------------------------------:| | rname | Locus name | | numreads | Number reads aligned to the region (after filtering)| -| covbases | Number of covered bases with depth >= 1 | +| covbases | Number of covered bases with depth >= 10 | | coverage | Percentage of covered bases [0..100] | | meandepth | Mean depth of coverage | | meanbaseq | Mean baseQ in covered region | @@ -264,7 +268,7 @@ The sequence of the two copies of *mompS* and the identity of the correct allele a. Only one allele has associated reads with primer *mompS*-1116R correctly oriented. - b. One allele has more than three times [still valid?] as many reads with correctly oriented primer as the other. + b. One allele has more than three times as many reads with correctly oriented primer as the other. c. One allele has no associated reads with the primer *mompS*-1116R in either orientation, but the other allele has associated reads with the primer in only the wrong orientation. In this case, the allele with no associated reads with the primer in either orientation is considered the primary locus by the process of elimination. @@ -288,4 +292,4 @@ nextflow run_el_gato.nf --reads_dir --threads --threads --out -profile singularity -c nextflow.config ``` -**Note:** To run nextflow without the singularity container, uncomment conda environment installation on line 10 and line 47 of the run_el_gato.nf file. \ No newline at end of file +**Note:** To run nextflow without the singularity container, uncomment conda environment installation on line 10 and line 47 of the run_el_gato.nf file. 
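A note on the locus ordering introduced in PATCH 02/40 above: the blocks added to run_command() and blast_remaining_loci() rebuild the BLAST table with a nested list comprehension and string concatenation. Purely as an illustrative sketch (not part of the patch series), the same SBT ordering can be expressed as a stable sort; like the patch, it assumes each hit's subject-sequence ID contains the locus name (e.g. "flaA_3").

```
# Sketch of the SBT-order reordering of BLAST rows (illustration only).
SBT_ORDER = ['flaA', 'pilE', 'asd', 'mip', 'mompS', 'proA', 'neuA_neuAH']

def sbt_rank(row):
    for i, locus in enumerate(SBT_ORDER):
        if locus in row[1]:          # row[1] is the subject-sequence ID (sseqid)
            return i
    return len(SBT_ORDER)            # rows matching no locus sort last

def reorder_blast_rows(result):
    rows = [line.split() for line in result.splitlines()]
    rows.sort(key=sbt_rank)          # stable sort keeps within-locus order
    return "\n".join("\t".join(row) for row in rows) + "\n"
```

One behavioural difference to be aware of: the patch drops rows whose subject ID matches none of the seven loci, whereas this sketch keeps them at the end of the table.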
From 62bd9b664b0d1f8f6e327e10a6d55ca6b12d901f Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:34:06 -0400 Subject: [PATCH 06/40] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fc7d37e..dc71810 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # el_gato **E**pidemiology of ***L**egionella* : **G**enome-b**A**sed **T**yping: -El_gato is a Bioinformatics tool that utilizes either a genome assembly (.fasta) or Illumina paired-end reads (.fastq) to replicate *Legionella pneumophila* Sequence Based Typing (SBT). From the input, 7 loci (*flaA*, *pilE*, *asd*, *mip*, *mompS*, *proA*, *neuA/neuAh*) are identified and compared to a database of sequence types. The sequence type provided for each input sample is based on the unique combination of the allelic identities of the 7 target loci. +El_gato is a bioinformatics tool that utilizes either a genome assembly (.fasta) or Illumina paired-end reads (.fastq) to replicate *Legionella pneumophila* Sequence Based Typing (SBT). From the input, 7 loci (*flaA*, *pilE*, *asd*, *mip*, *mompS*, *proA*, *neuA/neuAh*) are identified and compared to a database of sequence types. The sequence type provided for each input sample is based on the unique combination of the allelic identities of the 7 target loci. * [Installation](#installation) * [Method 1: using conda](#method-1-using-conda) From 21905c968c64a4a42e5577e192c0625a2bfffc3b Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:42:30 -0400 Subject: [PATCH 07/40] Update README.md --- README.md | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index dc71810..a058d1e 100755 --- a/README.md +++ b/README.md @@ -151,8 +151,6 @@ The ST column can contain two kinds of values. If the identified ST corresponds The corresponding allele number is reported for each gene if an exact allele match is found in the database. Alternatively, el_gato may also note the following symbols: -[Table needs to be updated, but output does not match table in ppt] - | Symbol | Meaning | |:------:|:---------| |Novel ST | Novel Sequence Type: All 7 target genes were found, but not present in the profile - most likely a novel sequence type. | @@ -292,4 +290,13 @@ nextflow run_el_gato.nf --reads_dir --threads --threads --out -profile singularity -c nextflow.config ``` -**Note:** To run nextflow without the singularity container, uncomment conda environment installation on line 10 and line 47 of the run_el_gato.nf file. 
+**Note:** To run nextflow without the singularity container, uncomment conda environment installation on line 10 and line 47 of the run_el_gato.nf file and use the following commands: + +``` +# Reads +nextflow run_el_gato.nf --reads_dir --threads --out + +# Assemblies +nextflow run_el_gato.nf --assembly_dir --threads --out + +``` From 79df2c74fba0744d247ac8e315f2b10200be2c32 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:43:01 -0400 Subject: [PATCH 08/40] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index a058d1e..ec82f81 100755 --- a/README.md +++ b/README.md @@ -298,5 +298,4 @@ nextflow run_el_gato.nf --reads_dir --threads --threads --out - ``` From 527fec0814ce34081692c04d9d5d0df331d33fd3 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Fri, 8 Sep 2023 14:45:05 -0400 Subject: [PATCH 09/40] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ec82f81..0b92dca 100755 --- a/README.md +++ b/README.md @@ -299,3 +299,5 @@ nextflow run_el_gato.nf --reads_dir --threads --threads --out ``` +## Output files for Nextflow +At the completion of a run, the specified output directory (default: el_gato_out/) will contain a file named "all_mlst.txt" (the MLST profile of each sample) and one directory for each sample processed. Each sub-directory is named with a sample name and contains output files specific to that sample. These files include the el_gato log file and files providing more details about the sequences identified in the sample. From a21626f1dca17efd941ced4d193bbef2f2b35d62 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:44:31 -0400 Subject: [PATCH 10/40] Update nextflow script --- run_el_gato.nf | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/run_el_gato.nf b/run_el_gato.nf index 7c24e15..90da9ac 100644 --- a/run_el_gato.nf +++ b/run_el_gato.nf @@ -98,6 +98,26 @@ process CAT { """ } +process FINAL_JSON { + publishDir params.out, mode: 'copy', overwrite: true + input: + path files + + output: + path 'report.json' + + """ + echo "[" > report.tmp + cat \$(ls *.json | head -1) >> report.tmp + ls *.json | tail -n+2 | while read jfile; do + echo "," >> report.tmp; + cat \$jfile >> report.tmp; + done + echo "]" >> report.tmp + mv report.tmp report.json + """ +} + workflow { if (params.reads_dir) { @@ -105,6 +125,7 @@ workflow { files = RUN_EL_GATO_READS(readPairs).collect() CAT(files) + FINAL_JSON(files) } else { @@ -114,6 +135,7 @@ workflow { files = RUN_EL_GATO_ASSEMBLIES(assemblies).collect() CAT(files) + FINAL_JSON(files) } else { print "Please provide the path to a directory containing paired reads using --reads_dir or the path to a directory containing assemblies using --assembly_dir." 
From 2423b26f49403578a15bb93825b251b4e442ff00 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:44:58 -0400 Subject: [PATCH 11/40] Bump conda version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2bfe514..7c65af2 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name="elgato", - version="1.12.0", + version="1.14.0", python_requires='>=3.8', scripts = [ 'el_gato/el_gato.py', From cbc6c372ea8bc6fca30c1878a9fd8325fdff6bab Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:45:41 -0400 Subject: [PATCH 12/40] Added report json output --- el_gato/el_gato.py | 111 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 100 insertions(+), 11 deletions(-) diff --git a/el_gato/el_gato.py b/el_gato/el_gato.py index ddfa1d2..194195c 100644 --- a/el_gato/el_gato.py +++ b/el_gato/el_gato.py @@ -11,6 +11,7 @@ import shlex import time import math +import json from collections import defaultdict, Counter, OrderedDict t0 = time.time() script_filename = inspect.getframeinfo(inspect.currentframe()).filename @@ -215,7 +216,8 @@ def get_args() -> argparse.ArgumentParser: action="store_true", required=False, default=False) group2.add_argument("--header", "-e", help="Include column headers in the output table (default: %(default)s)", action="store_true", required=False, default=False), group2.add_argument("--length", "-l", help="Specify the BLAST hit length threshold for identifying multiple loci in assembly (default: %(default)s)", type = float, required=False, default=0.3), - group2.add_argument("--sequence", "-q", help="Specify the BLAST hit percent identity threshold for identifying multiple loci in assembly (default: %(default)s)", type = float, required=False, default=95.0), + group2.add_argument("--sequence", "-q", help="Specify the BLAST hit percent identity threshold for identifying multiple loci in assembly (default: %(default)s)", type = float, required=False, default=95.0) + group2.add_argument("--samfile", "-m", help="Specify whether or not the SAM file is included in the output directory (default: %(default)s)", action="store_true", required=False, default=False), return parser @@ -373,10 +375,13 @@ def set_inputs( if "r" in inputs["analysis_path"]: inputs["read1"] = args.read1 inputs["read2"] = args.read2 + inputs["samfile"] = args.samfile + inputs["json_out"]['operation_mode'] = "Reads" if "a" in inputs["analysis_path"]: inputs["assembly"] = args.assembly inputs["length"] = args.length inputs["sequence"] = args.sequence + inputs["json_out"]['operation_mode'] = "Assembly" inputs["threads"] = args.threads inputs["out_prefix"] = args.out inputs["log"] = os.path.join(args.out, "run.log") @@ -904,6 +909,11 @@ def blast_remaining_loci(inputs: dict, assembly_file: str, ref: Ref, momps: bool loci.insert(4, "mompS") calls = {k:[] for k in loci} al = [] + j_list = [] + blast_list = [] + blast_dict = {} + b_dict = {} + loc_dict = {} res_string = '' assembly_dict = fasta_to_dict(assembly_file) @@ -914,6 +924,7 @@ def blast_remaining_loci(inputs: dict, assembly_file: str, ref: Ref, momps: bool result = run_command(blast_command, tool='blast', shell=True, log_output=False) result = filter_blast_hits(result, momps=momps, len_thresh=inputs['length'], pcnt_id_thresh=inputs['sequence']) + # Now do the logging of the good blast hits @@ -939,7 +950,27 @@ def blast_remaining_loci(inputs: dict, assembly_file: 
str, ref: Ref, momps: bool f.write(desc_header + '\n\n') f.write(f"Command:\n\n{blast_command}\n\n") f.write(f"Output:\n\n{pretty_result}\n\n") - + + # Write BLAST results to json + + for line in result.splitlines(): + line = line.split() + j_list.append(line) + for j in j_list: + b_list = [] + b_list.append(str(j[1])) + b_list.append(str(j[0])) + b_list.append(str(j[6])) + b_list.append(str(j[7])) + for l in loci: + if l in b_list[0]: + if l not in b_dict.keys(): + b_dict[l] = [b_list] + else: + b_dict[l].append(b_list) + + blast_dict = {"BLAST_hit_locations": b_dict} + inputs["json_out"]['mode_specific'] = blast_dict for line in result.strip().split('\n'): bits = line.split() @@ -1124,7 +1155,6 @@ def assess_allele_conf(bialleles, reads_at_locs, allele_idxs, read_info_dict, re allele.confidence['against'] += 1 break - return alleles_info @@ -1151,7 +1181,7 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str gene, _, _, cov, depth, _, _ = line.split() cov = float(cov) depth = float(depth) - cov_results[gene] = {'cov': cov, 'depth': depth} + cov_results[gene] = {"Proportion_covered": str(cov), "Minimum_coverage": str(depth)} if cov != 100.: if 'neuA' in gene: if cov < 99: @@ -1176,7 +1206,7 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str if len([i for i in ref.REF_POSITIONS.keys() if 'neuA' in i]) > 1: cov_sorted = sorted( - [(k,v['depth']) for k,v in cov_results.items()], + [(k,v['Minimum_coverage']) for k,v in cov_results.items()], key=lambda x: x[1], reverse=True) # Try to use depth to determine the right one to use @@ -1352,7 +1382,7 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str else: allele.seq += allele.basecalls[biallic_count] if len(base) > 1: - biallic_count+=1 + biallic_count+=1 with open(f"{outdir}/intermediate_outputs.txt", 'a') as f: f.write(cov_msg + "\n\n") @@ -1376,10 +1406,14 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str elif len(neuAs) == 1: alleles['neuA_neuAH'] = alleles[neuAs[0]] del alleles[neuAs[0]] + + cov_dict = {} + cov_dict['locus_coverage'] = cov_results + + inputs["json_out"]['mode_specific'] = cov_dict return alleles - def write_alleles_to_file(alleles: list, outdir: str): identified_allele_fasta_string = "" loci = ['flaA', 'pilE', 'asd', 'mip', 'mompS', 'proA', 'neuA_neuAH'] @@ -1425,6 +1459,7 @@ def map_alleles(inputs: dict, ref: Ref): db = inputs['sbt'] sample_name = inputs['sample_name'] profile = inputs['profile'] + samfile = inputs['samfile'] # Run BWA mem logging.info("Mapping reads to reference sequence, then filtering unmapped reads from sam file") @@ -1531,6 +1566,7 @@ def map_alleles(inputs: dict, ref: Ref): alleles['mompS'] = [a for a in alleles['mompS'] if a.confidence['for'] > 0] temp_alleles = alleles.copy() temp_alleles['mompS'] = old_mompS + write_possible_mlsts(inputs=inputs, alleles=temp_alleles, header=True, confidence=True) for locus in [i for i in alleles if i != 'mompS']: @@ -1552,6 +1588,12 @@ def map_alleles(inputs: dict, ref: Ref): with open(f"{outdir}/intermediate_outputs.txt", "a") as fout: fout.write(message) + if samfile == True: + pass + else: + os.remove(f"{outdir}/reads_vs_all_ref_filt.sam") + + return alleles @@ -1578,6 +1620,23 @@ def write_possible_mlsts(inputs: dict, alleles: dict, header: bool, confidence: possible_mlsts += (inputs['sample_name'] + "\t" + get_st(allele_profile, Ref, profile_file=inputs["profile"]) + "\t" + allele_profile + 
f"\t{mompS.confidence['for']}\t{mompS.confidence['against']}\n") else: possible_mlsts += (inputs['sample_name'] + "\t" + get_st(allele_profile, Ref, profile_file=inputs["profile"]) + "\t" + allele_profile + "\n") + + al_list = [] + primer_list = [] + + if inputs["json_out"]['operation_mode'] == "Reads": + for line in possible_mlsts.splitlines(): + line = line.split("\t") + al_list.append(line) + for a in al_list[1:]: + pl = [] + pl.append("mompS_"+str(a[6])) + pl.append(str(a[9])) + pl.append(str(a[10])) + primer_list.append(pl) + inputs["json_out"]['mode_specific']['mompS_primers'] = primer_list + else: + pass with open(f"{inputs['out_prefix']}/possible_mlsts.txt", 'w') as f: f.write(possible_mlsts) @@ -1642,7 +1701,8 @@ def print_table(inputs: dict, Ref: Ref, alleles: dict) -> str: str formatted ST + allele profile (and optional header) of the isolate """ - + loci = ['st', 'flaA', 'pilE', 'asd', 'mip', 'mompS', 'proA', 'neuA_neuAH'] + i = 0 outlines = [] if inputs['header']: header = "Sample\tST\t" + "\t".join(Ref.locus_order) @@ -1663,7 +1723,31 @@ def print_table(inputs: dict, Ref: Ref, alleles: dict) -> str: + "\t" + allele_profile) outlines.append(allele_profile) - + + inputs["json_out"]['id'] = inputs["sample_name"] + if inputs["json_out"]['operation_mode'] == "Assembly": + inputs["json_out"]['mode_specific']['length_id'] = str(inputs["length"]) + inputs["json_out"]['mode_specific']['sequence_id'] = str(inputs["sequence"]) + else: + pass + allele_dict = {} + + + out_string = "\n".join(outlines) + for line in out_string.splitlines(): + line = line.split("\t") + while i < len(loci): + allele_dict[loci[i]] = line[i+1] + i+=1 + inputs["json_out"]['mlst'] = allele_dict + + if inputs["json_out"]['operation_mode'] == "Reads": + if allele_dict['mompS'] == "?" 
or allele_dict['mompS'] == "NAT" or allele_dict['mompS'] == "-": + inputs["json_out"]['mode_specific']['mompS_primer_conclusion'] = "inconclusive" + else: + inputs["json_out"]['mode_specific']['mompS_primer_conclusion'] = "mompS_"+allele_dict['mompS'] + else: + pass return "\n".join(outlines) @@ -1716,7 +1800,9 @@ def main(): 'logging_buffer_message' : "", 'header' : True, 'length' : 0.3, - 'sequence' : 95.0 + 'sequence' : 95.0, + 'samfile' : False, + 'json_out' : {} } @@ -1745,11 +1831,14 @@ def main(): get_inputs(inputs) logging.info("Starting analysis") output = choose_analysis_path(inputs, Ref) + json_out = inputs["json_out"] + json_dump = json.dumps(json_out, indent=2) + with open(os.path.join(inputs["out_prefix"], "report.json"), "w") as j_out: + j_out.write(json_dump) logging.info("Finished analysis") logging.debug(f"Output = \n{output}\n") print(output) - total_time = pretty_time_delta(int(time.time() - t0)) logging.info(f"The program took {total_time}") From 232c915da3e489776c85663c2f2f6b5d359686aa Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Mon, 18 Sep 2023 11:46:11 -0400 Subject: [PATCH 13/40] bump conda version --- conda-recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 3ba78e1..14ee284 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "elgato" %} -{% set version = "1.12.0" %} +{% set version = "1.14.0" %} package: name: {{ name|lower }} From 2c1e9e289865151421f5e39be45103719db1c44c Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:03:06 -0400 Subject: [PATCH 14/40] Reporting module for el_gato --- elgato_report.py | 438 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 438 insertions(+) create mode 100644 elgato_report.py diff --git a/elgato_report.py b/elgato_report.py new file mode 100644 index 0000000..6b24309 --- /dev/null +++ b/elgato_report.py @@ -0,0 +1,438 @@ +#!/usr/bin/env python3 + +import sys +import json +import math +from dataclasses import dataclass +from datetime import datetime +from fpdf import FPDF +from ctypes import alignment + +LOGO="""\ + /\\_/\\ +( o.o ) + > ^ <\ +""" + +summary_header = """\ +El_gato utilizes either a genome assembly (.fasta) or Illumina paired-end reads (.fastq) to replicate \ +Legionella pneumophila Sequence Based Typing (SBT). From the input, 7 loci (flaA, pilE, asd, mip, \ +mompS, proA, and neuA/neuAh) are identified and compared to a database of subtypes. The sequence \ +type provided for each input sample is based on the unique combination of the allelic identities of the 7 \ +target loci. It is possible that multiple alleles for a specific locus were identified and unable to be resolved properly. \ +This would lead to a ST that indicates that one or more loci contain multiple alleles.\ +""" + +reads_header = """\ +The following sample was analyzed using the paired-end reads functionality. The tables below show the full \ +MLST profile of the sample, the coverage data for each locus, and information regarding the primers used to \ +identify the primary mompS allele. Highlighted rows illustrate the data that caused failure of allele identification.\ +""" + +assembly_header = """\ +The following sample was analyzed using the assembly functionality. The tables below show the full \ +MLST profile of the sample and the corresponding locus location information. 
Unless specified by the user, \ +el_gato utilizes a default 30% (0.3) BLAST hit length threshold and a 95% (95.0) sequence identity threshold \ +to identify multiple alleles on multiple contigs ('?'). Highlighted rows illustrate the data that caused \ +failure of allele identification.\ +""" + +cdc_header = """\ +National Center for Immunization and Respiratory \n +Diseases (NCIRD) \n +Division of Bacterial Diseases (DBD) / \n +Respiratory Diseases Branch (RDB) \n +Pneumonia Response and Surveillance Laboratory (PRSL)\ +""" +abbrev_key = """\ +Novel ST = The alleles for all 7 loci were identified, however their unique combination and corresponding ST has not been found in the database. \n +Novel ST* = One or more locus failed to amplify, which may indicate a novel allele. \n +MA? = One or more locus contain multiple alleles that could not be resolved, leading to an ambiguous ST. \n +MD- = One or more locus was unidentifiable, leading to an unidentifiable ST. \n +'-' = There is missing data for the locus and could not be identified. \n +'NAT' = Amplification of the locus was unsuccessful, possibly due to the presence of a novel allele type. \n +'?' = The locus contains multiple alleles that could not be resolved.\ +""" + +primer_footer = """\ +The primary mompS allele is identified using the following criteria: \n +1. Only one sequence has associated reads with the correctly oriented primers. \n +2. One sequence has more than 3 times as many reads with the correctly oriented primer as the other. \n +3. One sequence has no associated reads with the primer in either orientation, but the other has reads with the primer only in the wrong direction. The sequence with no associated reads is considered the primary locus in this case.\ +""" +@dataclass +class Report(FPDF): + sample_id: str + st: str + flaA: str + pilE: str + asd: str + mip: str + mompS: str + proA: str + neuA_neuAH: str + mode: str + mode_specific: dict + + @classmethod + def from_json(cls, json_data): + sample_id = json_data["id"] + st = json_data["mlst"]["st"] + flaA = json_data["mlst"]["flaA"] + pilE = json_data["mlst"]["pilE"] + asd = json_data["mlst"]["asd"] + mip = json_data["mlst"]["mip"] + mompS = json_data["mlst"]["mompS"] + proA = json_data["mlst"]["proA"] + neuA_neuAH = json_data["mlst"]["neuA_neuAH"] + mode = json_data["operation_mode"] + mode_specific = json_data["mode_specific"] + x = cls( + sample_id, + st, + flaA, + pilE, + asd, + mip, + mompS, + proA, + neuA_neuAH, + mode, + mode_specific, + ) + return x + + def list_mlst(self): + return [ + self.sample_id, + self.st, + self.flaA, + self.pilE, + self.asd, + self.mip, + self.mompS, + self.proA, + self.neuA_neuAH + ] + + def sample_report( + self, + pdf, + typeface='Courier', + body_style='', + body_size=11, + head_style='B', + head_size=16 + ): + + pdf.add_page() + pdf.set_font(typeface, head_style, head_size) + + if self.mode == "Assembly": + pdf = self.assembly_report(pdf, typeface, body_style, body_size) + elif self.mode == "Reads": + pdf = self.reads_report(pdf, typeface, body_style, body_size) + else: + sys.exit( + f"Unsupported operation mode identified for sample {self.sample_id}" + ) + return pdf + + + def reads_report(self, pdf, typeface, style, size): + pdf.set_font(typeface, style, size) + pdf.set_font('Courier', 'B', 10) + pdf.cell(80) + pdf.multi_cell(h=4,w=0,txt="Epidemiology of Legionella: Genome-based Typing (el_gato) Paired-End Reads Report", align="C") + pdf.ln(10) + pdf.set_font('Courier', '', 11) + pdf.multi_cell( + w=0,h=5, + txt=reads_header, + 
new_x="LMARGIN", new_y="NEXT" + ) + pdf.ln(10) + + pdf = self.make_mlst_table(pdf, [self.list_mlst()]) + pdf.ln(10) + + pdf.set_font(style="BU") + pdf.cell( + w=0,h=10, + txt=f"Locus Coverage Information", + new_x="LMARGIN", new_y="NEXT", align="C" + ) + + pdf.set_font() + pdf = self.read_coverage_table(pdf) + + pdf.add_page() + pdf.ln(10) + pdf.set_font(style="BU") + pdf.cell( + w=0,h=10, + txt=f"mompS Primer Information", + new_x="LMARGIN", new_y="NEXT", align="C" + ) + pdf.set_font() + pdf = self.mompS_primer_table(pdf) + pdf.ln(0) + + pdf.multi_cell( + w=0, h=3.5, + txt=primer_footer, + new_x="LMARGIN", new_y="NEXT" + ) + + return pdf + + + def read_coverage_table(self, pdf): + contents = [["Locus", "Proportion Covered", "Minimum Coverage"]] + contents += [ + [ + k, v["Proportion_covered"], v["Minimum_coverage"] + ] for k, v in self.mode_specific["locus_coverage"].items()] + col_widths = (50, 50, 50) + alignment = ("CENTER", "CENTER", "CENTER") + + highlight_rows = set() + for n, row in enumerate(contents[1:]): + if ( + (float(row[1]) < 100 and row[0] != "neuA_neuAH") + or (float(row[1]) < 99 and row[0] == "neuA_neuAH") + ): + highlight_rows.add(n+1) + pdf = self.make_table( + pdf, + contents, + col_widths=col_widths, + text_align=alignment, + highlight_rows=highlight_rows + ) + + return pdf + + def mompS_primer_table(self, pdf): + contents = [["Allele", "Reads Indicating Primary", "Reads Indicating Secondary"]] + contents += self.mode_specific["mompS_primers"] + col_widths = (50, 50, 50) + alignment = ("CENTER", "CENTER", "CENTER") + if self.mode_specific["mompS_primer_conclusion"] == "inconclusive": + highlight_rows = {1,2} + else: + highlight_rows = set() + + pdf = self.make_table( + pdf, + contents, + col_widths=col_widths, + text_align=alignment, + highlight_rows=highlight_rows + ) + pdf.ln(4) + + return pdf + + def assembly_report(self, pdf, typeface, style, size): + pdf.set_font(typeface, style, size) + pdf.cell(80) + pdf.set_font('Courier', 'B', 10) + pdf.multi_cell(h=4,w=0, txt="Epidemiology of Legionella: Genome-based Typing (el_gato) Assembly Results", align="C") + pdf.ln(10) + pdf.set_font('Courier', '', 11) + pdf.multi_cell( + w=0,h=5, + txt=assembly_header, + new_x="LMARGIN", new_y="NEXT" + ) + pdf.ln(10) + pdf = self.make_mlst_table(pdf, [self.list_mlst()]) + pdf.ln(4) + pdf.cell( + w=0,h=2, + txt="Length Identity and Sequence Identity: " + self.mode_specific['length_id'] +"; "+self.mode_specific['sequence_id'] + "%", + new_x="LMARGIN", new_y="NEXT" + ) + pdf.ln(10) + + pdf.set_font(style="BU") + pdf.cell( + w=0,h=10, + txt=f"Locus Location Information", + new_x="LMARGIN", new_y="NEXT", align="C" + ) + pdf.set_font() + pdf = self.locus_location_table(pdf) + + return pdf + + + def locus_location_table(self, pdf): + contents = [["locus", "allele", "contig", "start", "stop"]] + highlight_rows = set() + x = 1 + for k, v in self.mode_specific["BLAST_hit_locations"].items(): + contents.append([k] + v[0]) + if len(v) > 1: + for _ in range(len(v)): + highlight_rows.add(x) + x+=1 + for row in v[1:]: + contents.append([""] + row) + else: + x+=1 + col_widths = (25, 30, 70, 20, 20) + alignment = ("CENTER", "CENTER", "CENTER", "CENTER", "CENTER") + + content = [i for i in contents] + batches = self.fit_table(pdf, content, pdf.get_y(), 34) + pdf = self.make_table( + pdf, + batches[0], + col_widths=col_widths, + text_align=alignment, + highlight_rows=highlight_rows + ) + if len(batches) > 1: + for batch in batches[1:]: + pdf.add_page() + pdf.set_y(pdf.get_y() + 10) + pdf = 
Report.make_table( + pdf, + batch, + col_widths=col_widths, + text_align=alignment, + highlight_rows=highlight_rows + ) + + return pdf + + @staticmethod + def make_table(pdf, data, col_widths=None, text_align=None, highlight_rows=set()): + with pdf.table( + col_widths=col_widths, + text_align=text_align, + #borders_layout="MINIMAL" + ) as table: + for n, data_row in enumerate(data): + row = table.row() + if n in highlight_rows: + pdf.set_fill_color(233, 79, 88) + else: + pdf.set_fill_color(0, 0, 0) + for item in data_row: + row.cell(item) + return pdf + + @staticmethod + def fit_table(pdf, data, initial_y, characters:int): + font_size = pdf.font_size + pdf_y = initial_y + n = 0 + max_length = 0 + batches = [] + this_batch = [] + while n < len(data): + row = data[n] + for i in row: + column_length = len(i) + if column_length > max_length: + max_length = column_length + num_lines = math.ceil(max_length / characters) + cell_height = 2* num_lines * font_size + if pdf_y + cell_height > pdf.page_break_trigger: + batches.append(this_batch) + this_batch = [row] + n+=1 + pdf_y = 60 + cell_height # Whatever we want the starting y position to be on a new page + continue + + n+=1 + pdf_y += cell_height + this_batch.append(row) + batches.append(this_batch) + return batches + + + @staticmethod + def make_mlst_table(pdf, data): + contents = [["Sample ID","ST","flaA","pilE","asd","mip","mompS","proA","neuA"]] + for sample in data: + contents.append(sample) + col_widths = (50, 20, 20, 20, 20, 20, 20, 20, 20) + alignment = ("CENTER", "CENTER", "CENTER", "CENTER", "CENTER", "CENTER", "CENTER", "CENTER", "CENTER") + pdf = Report.make_table(pdf, contents, col_widths=col_widths, text_align=alignment) + return pdf + + @staticmethod + def read_jsons(files): + data = [] + for file in files: + with open(file) as fin: + json_data = json.load(fin) + data.append(Report.from_json(json_data)) + return data + + @staticmethod + def read_multi_json(files): + data = [] + with open(files) as fin: + json_data = json.load(fin) + for i in json_data: + data.append(Report.from_json(i)) + + return data + +class PDF(FPDF): + def header(self): + self.image("https://www.drought.gov/sites/default/files/styles/i_square_240_240/public/hero/partners/CDC_logo.png.webp?itok=Yja6wdoM", 10, 8, 33, keep_aspect_ratio=True) + self.set_font('Courier', '', 10) + self.cell(80) + self.multi_cell(h=2,w=0, txt=cdc_header, align="C") + self.ln(2) + +def main(): + with open(sys.argv[1]) as fin: + if fin.read().startswith("["): + data = Report.read_multi_json(sys.argv[1]) + else: + data = Report.read_jsons(sys.argv[1:]) + pdf = PDF('P', 'mm', 'Letter') + pdf.add_page() + pdf.set_font('Courier', 'B', 10) + pdf.cell(80) + pdf.multi_cell(h=4,w=0,txt="Epidemiology of Legionella: Genome-based Typing (el_gato) Batch Results Report", align="C") + pdf.ln(2) + pdf.set_font('Courier', '', 16) + pdf.multi_cell(w=0,h=6, txt=LOGO, new_x="LMARGIN", new_y="NEXT") + pdf.ln(4) + pdf.set_font('Courier', '', 11) + pdf.multi_cell(w=0,h=5, txt=summary_header, new_x="LMARGIN", new_y="NEXT") + pdf.ln(10) + content = [i.list_mlst() for i in data] + batches = Report.fit_table(pdf, content, pdf.get_y(), 19) + for batch in batches: + if batch != batches[-1]: + pdf.set_font('Courier', '', 11) + pdf = Report.make_mlst_table(pdf, batch) + pdf.add_page() + pdf.ln(10) + else: + pdf.set_font('Courier', '', 11) + pdf = Report.make_mlst_table(pdf, batch) + pdf.ln(5) + pdf.set_font(style="U") + pdf.cell(w=0,h=0, txt="Abbreviation Key", new_x="LMARGIN", new_y="NEXT") + pdf.ln(5) + 
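+    # Clear the underlined heading style and print the abbreviation key text
+    # beneath the batch summary table.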
pdf.set_font() + pdf.multi_cell(w=0,h=3.5, txt=abbrev_key, new_x="LMARGIN", new_y="NEXT") + + + for datum in data: + pdf = datum.sample_report(pdf) + + pdf.output("report.pdf") + +if __name__ == '__main__': + main() From 4e52ff6046116a6cf9da386ec7366dd78862f685 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:03:33 -0400 Subject: [PATCH 15/40] Updated nextflow --- run_el_gato.nf | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/run_el_gato.nf b/run_el_gato.nf index 90da9ac..f3d2a47 100644 --- a/run_el_gato.nf +++ b/run_el_gato.nf @@ -118,6 +118,20 @@ process FINAL_JSON { """ } +process FINAL_REPORT { + conda "-c conda-forge -c bioconda -c appliedbinf elgato" + publishDir params.out, mode: 'copy', overwrite: true + input: + path files + + output: + path 'report.pdf' + + """ + elgato_report.py *.json + """ +} + workflow { if (params.reads_dir) { @@ -126,6 +140,7 @@ workflow { files = RUN_EL_GATO_READS(readPairs).collect() CAT(files) FINAL_JSON(files) + FINAL_REPORT(files) } else { @@ -136,6 +151,7 @@ workflow { files = RUN_EL_GATO_ASSEMBLIES(assemblies).collect() CAT(files) FINAL_JSON(files) + FINAL_REPORT(files) } else { print "Please provide the path to a directory containing paired reads using --reads_dir or the path to a directory containing assemblies using --assembly_dir." From 7b34205478b9f8a885d0cb6996d1eb4df7f77d3a Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:03:58 -0400 Subject: [PATCH 16/40] bump conda version --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7c65af2..b041f1c 100644 --- a/setup.py +++ b/setup.py @@ -3,11 +3,12 @@ setup( name="elgato", - version="1.14.0", + version="1.14.3", python_requires='>=3.8', scripts = [ 'el_gato/el_gato.py', - 'run_el_gato.nf' + 'run_el_gato.nf', + 'elgato_report.py' ], install_requires=[ "colorama; platform_system == 'Linux'", From 755141ad14f3959e4bdd84d27c4aaa3b7bffb1f9 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:04:18 -0400 Subject: [PATCH 17/40] Updates to json --- el_gato/el_gato.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/el_gato/el_gato.py b/el_gato/el_gato.py index 194195c..8f85583 100644 --- a/el_gato/el_gato.py +++ b/el_gato/el_gato.py @@ -1217,6 +1217,8 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str del ref.REF_POSITIONS[gene[0]] else: logging.info("Analysis of read mapping to neuA locus variants was unsuccessful. 
Unclear which to use.") + print(cov_sorted) + sys.exit() #ref.REF_POSITIONS = OrderedDict([(x, ref.REF_POSITIONS[x]) for loci in x]) @@ -1587,12 +1589,11 @@ def map_alleles(inputs: dict, ref: Ref): with open(f"{outdir}/intermediate_outputs.txt", "a") as fout: fout.write(message) - + if samfile == True: pass else: - os.remove(f"{outdir}/reads_vs_all_ref_filt.sam") - + os.remove(f"{outdir}/reads_vs_all_ref_filt.sam") return alleles From 4a5a4254e440b85fd7e21ce6acd0bee778e11750 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 11:04:38 -0400 Subject: [PATCH 18/40] bump conda version --- conda-recipe/meta.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 14ee284..5c1cf90 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "elgato" %} -{% set version = "1.14.0" %} +{% set version = "1.14.3" %} package: name: {{ name|lower }} @@ -27,6 +27,7 @@ requirements: - blast==2.13.0 - ispcr==33.0 - nextflow + - fpdf2 # build: # noarch: python From 994cc251a4d5cb544016e41803bfe419d98ee957 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 13:07:02 -0400 Subject: [PATCH 19/40] Bug fix --- elgato_report.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/elgato_report.py b/elgato_report.py index 6b24309..51a7962 100644 --- a/elgato_report.py +++ b/elgato_report.py @@ -251,7 +251,7 @@ def assembly_report(self, pdf, typeface, style, size): pdf.ln(4) pdf.cell( w=0,h=2, - txt="Length Identity and Sequence Identity: " + self.mode_specific['length_id'] +"; "+self.mode_specific['sequence_id'] + "%", + txt="BLAST Hit Length and Sequence Identity Thresholds: " + self.mode_specific['length_id'] +"; "+self.mode_specific['sequence_id'] + "%", new_x="LMARGIN", new_y="NEXT" ) pdf.ln(10) @@ -341,11 +341,11 @@ def fit_table(pdf, data, initial_y, characters:int): max_length = column_length num_lines = math.ceil(max_length / characters) cell_height = 2* num_lines * font_size - if pdf_y + cell_height > pdf.page_break_trigger: + if pdf_y + cell_height + 10 > pdf.page_break_trigger: batches.append(this_batch) this_batch = [row] n+=1 - pdf_y = 60 + cell_height # Whatever we want the starting y position to be on a new page + pdf_y = 60 # Whatever we want the starting y position to be on a new page continue n+=1 @@ -422,6 +422,9 @@ def main(): pdf.set_font('Courier', '', 11) pdf = Report.make_mlst_table(pdf, batch) pdf.ln(5) + if pdf.get_y() + 50 > pdf.page_break_trigger: + pdf.add_page() + pdf.ln(10) pdf.set_font(style="U") pdf.cell(w=0,h=0, txt="Abbreviation Key", new_x="LMARGIN", new_y="NEXT") pdf.ln(5) From 8f8c5867f58e84cf1ab91215b388694964f3a765 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 13:07:29 -0400 Subject: [PATCH 20/40] bump conda version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b041f1c..c72f368 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ setup( name="elgato", - version="1.14.3", + version="1.14.4", python_requires='>=3.8', scripts = [ 'el_gato/el_gato.py', From 86ecd928f69f205bc46dddb13df096ac96e20311 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 13:07:48 -0400 Subject: [PATCH 21/40] bump conda version --- conda-recipe/meta.yaml | 2 
+- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 5c1cf90..da5d899 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "elgato" %} -{% set version = "1.14.3" %} +{% set version = "1.14.4" %} package: name: {{ name|lower }} From 723c6d31bd05e1d4a6e2492eebdf7a662c6f1d8a Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:27:39 -0400 Subject: [PATCH 22/40] Update README.md --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 0b92dca..21faea2 100755 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ El_gato is a bioinformatics tool that utilizes either a genome assembly (.fasta) * [identified_alleles.fna](#identified_allelesfna) * [run.log](#runlog) * [reads_vs_all_ref_filt_sorted.bam](#reads_vs_all_ref_filt_sortedbam-reads-only) + * [report.json](#report-json) * [How does el_gato work?](#approach) * [Using Nextflow](#using-nextflow) @@ -123,6 +124,10 @@ Optional arguments: Specify BLAST hit percent identity threshold for identifying multiple loci in an assembly (default: 95.0) +--samfile SAMFILE -m SAMFILE + Allows the user to include the reads_vs_all_ref_filt.sam + file to be included in the output directory. + (default: False) ``` # Input and Output From 628db86f7ac6216f17d6efb8f1ec0b0fa6729942 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:29:00 -0400 Subject: [PATCH 23/40] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 21faea2..f82cd3e 100755 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ Usage information printed when running el_gato.py with `-h` or `--help`. usage: el_gato.py [--read1 Read 1 file] [--read2 Read 2 file] [--assembly Assembly file] [--help] [--threads THREADS] [--depth DEPTH] [--out OUT] [--sample SAMPLE] [--overwrite] [--sbt SBT] [--suffix SUFFIX] [--profile PROFILE] [--verbose] [--header] [--length LENGTH] [--sequence SEQUENCE] +[--samfile SAMFILE] Legionella in silico sequence-based typing (SBT) script. Requires paired-end reads files or a genome assembly. @@ -126,7 +127,7 @@ Optional arguments: (default: 95.0) --samfile SAMFILE -m SAMFILE Allows the user to include the reads_vs_all_ref_filt.sam - file to be included in the output directory. + file to be included in the output directory (default: False) ``` From 323286f07165fe8b8cb4d09fc65a1c487b403bd5 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:35:24 -0400 Subject: [PATCH 24/40] Update README.md --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index f82cd3e..61ddf35 100755 --- a/README.md +++ b/README.md @@ -224,6 +224,13 @@ el_gato maps the provided reads to [a set of reference sequences in the el_gato **Note:** A SAM file is also present, which has the same information as in the BAM file. +### report.json +Each sample outputs a json file that contains relevant information about the run that will be included in the report PDF. +Summary page metadata: Complete MLST profile of the sample and the abbreviation key for the symbols. +Run-specific data: + Paired-end reads: Locus coverage information and *mompS* primer information. + Assembly: BLAST hit length and sequence identity thresholds and locus location information. 
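For orientation, a rough outline of a per-sample report.json is sketched below. The key names come from the JSON written by el_gato.py in PATCH 12/40 and read back by elgato_report.py in PATCH 14/40; the values are placeholders, not real output.

```
# Illustrative shape of a per-sample report.json (placeholder values only).
example_report = {
    "id": "sample01",
    "operation_mode": "Reads",          # or "Assembly"
    "mlst": {
        "st": "1", "flaA": "1", "pilE": "4", "asd": "3", "mip": "1",
        "mompS": "1", "proA": "1", "neuA_neuAH": "1",
    },
    "mode_specific": {
        # Reads mode: per-locus coverage plus mompS primer evidence
        "locus_coverage": {
            "flaA": {"Proportion_covered": "100.0", "Minimum_coverage": "52.0"},
        },
        "mompS_primers": [["mompS_1", "18", "0"]],
        "mompS_primer_conclusion": "mompS_1",
        # Assembly mode instead carries "BLAST_hit_locations", "length_id"
        # and "sequence_id".
    },
}
```

Which of the two mode_specific layouts is present is what drives the tables elgato_report.py renders for each sample.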
+ # Approach At its core, el_gato uses BLAST to identify the closest match to each allele in your input data. For the loci *flaA*, *pilE*, *asd*, *mip*, and *proA*, this process is straight forward. Whereas loci *mompS* and *neuA/neuAh* require more involved processing, with neuA/neuAh being an issue only when processing reads—the specifics of these loci are discussed in the corresponding sections below. From bfbb796a438898e7a1e7d4f38d41ab732eff13e9 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:36:48 -0400 Subject: [PATCH 25/40] Update README.md --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 61ddf35..641c0fd 100755 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ El_gato is a bioinformatics tool that utilizes either a genome assembly (.fasta) * [identified_alleles.fna](#identified_allelesfna) * [run.log](#runlog) * [reads_vs_all_ref_filt_sorted.bam](#reads_vs_all_ref_filt_sortedbam-reads-only) - * [report.json](#report-json) + * [report.json](#report.json) * [How does el_gato work?](#approach) * [Using Nextflow](#using-nextflow) @@ -226,10 +226,13 @@ el_gato maps the provided reads to [a set of reference sequences in the el_gato ### report.json Each sample outputs a json file that contains relevant information about the run that will be included in the report PDF. + Summary page metadata: Complete MLST profile of the sample and the abbreviation key for the symbols. + Run-specific data: - Paired-end reads: Locus coverage information and *mompS* primer information. - Assembly: BLAST hit length and sequence identity thresholds and locus location information. +Paired-end reads: Locus coverage information and *mompS* primer information. + +Assembly: BLAST hit length and sequence identity thresholds and locus location information. # Approach From 33420d035ab6906ac0bfdf18953c41b878a360e3 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:38:34 -0400 Subject: [PATCH 26/40] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 641c0fd..f77e880 100755 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ El_gato is a bioinformatics tool that utilizes either a genome assembly (.fasta) * [identified_alleles.fna](#identified_allelesfna) * [run.log](#runlog) * [reads_vs_all_ref_filt_sorted.bam](#reads_vs_all_ref_filt_sortedbam-reads-only) - * [report.json](#report.json) + * [report.json](#report-json) * [How does el_gato work?](#approach) * [Using Nextflow](#using-nextflow) @@ -224,12 +224,13 @@ el_gato maps the provided reads to [a set of reference sequences in the el_gato **Note:** A SAM file is also present, which has the same information as in the BAM file. -### report.json +### report-json Each sample outputs a json file that contains relevant information about the run that will be included in the report PDF. Summary page metadata: Complete MLST profile of the sample and the abbreviation key for the symbols. Run-specific data: + Paired-end reads: Locus coverage information and *mompS* primer information. Assembly: BLAST hit length and sequence identity thresholds and locus location information. 
From 34e5bda8c79bd9d37cd164a34fa285f596b16e34 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:38:58 -0400 Subject: [PATCH 27/40] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f77e880..b0799ba 100755 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ El_gato is a bioinformatics tool that utilizes either a genome assembly (.fasta) * [identified_alleles.fna](#identified_allelesfna) * [run.log](#runlog) * [reads_vs_all_ref_filt_sorted.bam](#reads_vs_all_ref_filt_sortedbam-reads-only) - * [report.json](#report-json) + * [report.json](#reportjson) * [How does el_gato work?](#approach) * [Using Nextflow](#using-nextflow) From 2e677704e6354e8157f332c92b336d5b6349fcff Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:52:01 -0400 Subject: [PATCH 28/40] Update README.md --- README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b0799ba..6690fa6 100755 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ El_gato is a bioinformatics tool that utilizes either a genome assembly (.fasta) * [report.json](#reportjson) * [How does el_gato work?](#approach) * [Using Nextflow](#using-nextflow) +* [Reporting Module](#reportingmodule) Codebase stage: development Developers and maintainers, Testers: [Andrew Conley](https://github.com/abconley), [Lavanya Rishishwar](https://github.com/lavanyarishishwar), [Emily T. Norris](https://github.com/norriset), [Anna Gaines](https://github.com/annagaines), [Will Overholt](https://github.com/waoverholt/), [Dev Mashruwala](https://github.com/dmashruwala), [Alan Collins](https://github.com/Alan-Collins) @@ -317,4 +318,16 @@ nextflow run_el_gato.nf --reads_dir --threads --threads --out ``` ## Output files for Nextflow -At the completion of a run, the specified output directory (default: el_gato_out/) will contain a file named "all_mlst.txt" (the MLST profile of each sample) and one directory for each sample processed. Each sub-directory is named with a sample name and contains output files specific to that sample. These files include the el_gato log file and files providing more details about the sequences identified in the sample. +At the completion of a run, the specified output directory (default: el_gato_out/) will contain a file named "all_mlst.txt" (the MLST profile of each sample) and one directory for each sample processed. Each sub-directory is named with a sample name and contains output files specific to that sample. These files include the el_gato log file and files providing more details about the sequences identified in the sample. + +Additionally, the specified output directory will contain a combined json file (report.json) that contains all of the data from the individual sample-level json files and the report PDF (report.pdf). + +#Reporting Module + +We provide a script that generates a PDF report of each el_gato run using the report.json file generated in the output folder for each sample. +This report generated by Nextflow by default, but must be run manually if running el_gato for individual samples. If run manually, the report PDF +will output to the current working directory. 
+ +``` +elgato_report.py /report.json +``` From 314a7e5bc3ee99ae77ce24c3b943923292933da1 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:52:30 -0400 Subject: [PATCH 29/40] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 6690fa6..f1b736c 100755 --- a/README.md +++ b/README.md @@ -327,7 +327,6 @@ Additionally, the specified output directory will contain a combined json file ( We provide a script that generates a PDF report of each el_gato run using the report.json file generated in the output folder for each sample. This report generated by Nextflow by default, but must be run manually if running el_gato for individual samples. If run manually, the report PDF will output to the current working directory. - ``` elgato_report.py /report.json ``` From 66b2777f3ffbd2af76a5fe0950c61574dbf518dd Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:52:48 -0400 Subject: [PATCH 30/40] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1b736c..3ffc28f 100755 --- a/README.md +++ b/README.md @@ -325,7 +325,7 @@ Additionally, the specified output directory will contain a combined json file ( #Reporting Module We provide a script that generates a PDF report of each el_gato run using the report.json file generated in the output folder for each sample. -This report generated by Nextflow by default, but must be run manually if running el_gato for individual samples. If run manually, the report PDF +This report generated by Nextflow by default, but must be run manually if running el_gato for individual samples. If run manually, the report PDF will output to the current working directory. ``` elgato_report.py /report.json From fd3208a98db28820f5f309b27c2d964ed067ac7f Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:53:20 -0400 Subject: [PATCH 31/40] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3ffc28f..2346c8e 100755 --- a/README.md +++ b/README.md @@ -322,7 +322,7 @@ At the completion of a run, the specified output directory (default: el_gato_out Additionally, the specified output directory will contain a combined json file (report.json) that contains all of the data from the individual sample-level json files and the report PDF (report.pdf). -#Reporting Module +# Reporting Module We provide a script that generates a PDF report of each el_gato run using the report.json file generated in the output folder for each sample. This report generated by Nextflow by default, but must be run manually if running el_gato for individual samples. 
If run manually, the report PDF From cb011b7cc75653a56632e7c788651882049b9224 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:53:42 -0400 Subject: [PATCH 32/40] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2346c8e..b44ef36 100755 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ El_gato is a bioinformatics tool that utilizes either a genome assembly (.fasta) * [report.json](#reportjson) * [How does el_gato work?](#approach) * [Using Nextflow](#using-nextflow) -* [Reporting Module](#reportingmodule) +* [Reporting Module](#reporting-module) Codebase stage: development Developers and maintainers, Testers: [Andrew Conley](https://github.com/abconley), [Lavanya Rishishwar](https://github.com/lavanyarishishwar), [Emily T. Norris](https://github.com/norriset), [Anna Gaines](https://github.com/annagaines), [Will Overholt](https://github.com/waoverholt/), [Dev Mashruwala](https://github.com/dmashruwala), [Alan Collins](https://github.com/Alan-Collins) From 4968cda078f168d6fa26949c228bd600a539defa Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:56:11 -0400 Subject: [PATCH 33/40] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b44ef36..1eee4d2 100755 --- a/README.md +++ b/README.md @@ -322,7 +322,10 @@ At the completion of a run, the specified output directory (default: el_gato_out Additionally, the specified output directory will contain a combined json file (report.json) that contains all of the data from the individual sample-level json files and the report PDF (report.pdf). -# Reporting Module +# Reporting Module + +## Dependencies + * [fpdf2](https://github.com/py-pdf/fpdf2) We provide a script that generates a PDF report of each el_gato run using the report.json file generated in the output folder for each sample. This report generated by Nextflow by default, but must be run manually if running el_gato for individual samples. If run manually, the report PDF From e9d6cc37038bf10ce21459b14649b486fe743cf4 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Thu, 28 Sep 2023 15:56:35 -0400 Subject: [PATCH 34/40] Update README.md --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 1eee4d2..785136d 100755 --- a/README.md +++ b/README.md @@ -323,7 +323,6 @@ At the completion of a run, the specified output directory (default: el_gato_out Additionally, the specified output directory will contain a combined json file (report.json) that contains all of the data from the individual sample-level json files and the report PDF (report.pdf). 
# Reporting Module - ## Dependencies * [fpdf2](https://github.com/py-pdf/fpdf2) From 8ce87fd7b3077b2fe3a0ecd0a186a13ce16f2a11 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Fri, 29 Sep 2023 11:30:26 -0400 Subject: [PATCH 35/40] Bug fix for parameters --- run_el_gato.nf | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/run_el_gato.nf b/run_el_gato.nf index f3d2a47..f8b28b7 100644 --- a/run_el_gato.nf +++ b/run_el_gato.nf @@ -8,6 +8,7 @@ params.depth = 10 params.length = 0.3 params.sequence = 95.0 params.out = 'el_gato_out' +params.samfile = false process RUN_EL_GATO_READS { conda "-c conda-forge -c bioconda -c appliedbinf elgato" @@ -26,8 +27,29 @@ process RUN_EL_GATO_READS { r1 = reads[0] r2 = reads[1] + if (params.samfile == true) { + + """ + mkdir ${sampleId}_out/ + + el_gato.py \ + -1 $r1 \ + -2 $r2 \ + -o ${sampleId}_out \ + -t ${task.cpus} \ + -d $params.depth \ + -m \ + -w > mlst.txt + + mv mlst.txt ${sampleId}_out/ + + for file in \$(ls ${sampleId}_out/); do + mv ${sampleId}_out/\$file ${sampleId}_out/${sampleId}_\$file + done """ +}else{ + """ mkdir ${sampleId}_out/ el_gato.py \ @@ -45,6 +67,7 @@ process RUN_EL_GATO_READS { done """ + } } process RUN_EL_GATO_ASSEMBLIES { From a6dd13006d2c76b49a938a2a9833ac0995a45193 Mon Sep 17 00:00:00 2001 From: Dev Mashruwala <31735258+dmashruwala@users.noreply.github.com> Date: Fri, 29 Sep 2023 14:15:16 -0400 Subject: [PATCH 36/40] Fix --- elgato_report.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/elgato_report.py b/elgato_report.py index 51a7962..a012092 100644 --- a/elgato_report.py +++ b/elgato_report.py @@ -37,13 +37,14 @@ failure of allele identification.\ """ -cdc_header = """\ -National Center for Immunization and Respiratory \n -Diseases (NCIRD) \n -Division of Bacterial Diseases (DBD) / \n -Respiratory Diseases Branch (RDB) \n -Pneumonia Response and Surveillance Laboratory (PRSL)\ +bioconda_header = """\ +El_gato Reports \n +Used for Batch and Sample-Level Summaries \n +Developed by Applied Bioinformatics Laboratory \n +(ABiL) \n +2023\ """ + abbrev_key = """\ Novel ST = The alleles for all 7 loci were identified, however their unique combination and corresponding ST has not been found in the database. \n Novel ST* = One or more locus failed to amplify, which may indicate a novel allele. 
\n @@ -386,10 +387,10 @@ def read_multi_json(files): class PDF(FPDF): def header(self): - self.image("https://www.drought.gov/sites/default/files/styles/i_square_240_240/public/hero/partners/CDC_logo.png.webp?itok=Yja6wdoM", 10, 8, 33, keep_aspect_ratio=True) + self.image("https://en.vircell.com/media/filer_public_thumbnails/filer_public/48/18/48184d99-1af0-46ad-a0ad-fcb65fa7b177/fotolia_7966544_xxlweb.jpg__409x999_q85_subsampling-2_upscale.jpg", 10, 8, 33, keep_aspect_ratio=True) self.set_font('Courier', '', 10) self.cell(80) - self.multi_cell(h=2,w=0, txt=cdc_header, align="C") + self.multi_cell(h=2,w=0, txt=bioconda_header, align="C") self.ln(2) def main(): From d63ef05b70514df52c486997c12911b194100a33 Mon Sep 17 00:00:00 2001 From: waoverholt Date: Wed, 4 Oct 2023 19:18:35 -0400 Subject: [PATCH 37/40] Fixed minor bug - case when both reads were secondary matches caused index error --- el_gato/el_gato.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/el_gato/el_gato.py b/el_gato/el_gato.py index 8f85583..51938c0 100644 --- a/el_gato/el_gato.py +++ b/el_gato/el_gato.py @@ -1346,8 +1346,10 @@ def process_reads(contig_dict: dict, read_info_dict: dict, ref: Ref, outdir: str if len(set(calls)) > 1: conflicting_reads.append(calls_list) break - else: + elif len(set(calls)) == 1: agreeing_calls.append(calls[0]) + else: + continue if len(agreeing_calls) != 0: read_pair_base_calls.append("".join(agreeing_calls)) From 1733c67de838dab85e1188dbb7bae4a91725ab23 Mon Sep 17 00:00:00 2001 From: waoverholt Date: Wed, 4 Oct 2023 19:55:20 -0400 Subject: [PATCH 38/40] added versioning control --- el_gato/el_gato.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/el_gato/el_gato.py b/el_gato/el_gato.py index 51938c0..1da42b2 100644 --- a/el_gato/el_gato.py +++ b/el_gato/el_gato.py @@ -13,9 +13,11 @@ import math import json from collections import defaultdict, Counter, OrderedDict +from pkg_resources import get_distribution t0 = time.time() script_filename = inspect.getframeinfo(inspect.currentframe()).filename script_path = os.path.dirname(os.path.abspath(script_filename)) +version = get_distribution('elgato').version class Ref: file = "Ref_Paris_mompS_2.fasta" @@ -196,6 +198,7 @@ def get_args() -> argparse.ArgumentParser: metavar="Assembly file") group2 = parser.add_argument_group(title='Optional arguments') group2.add_argument("--help", "-h", action="help", help="Show this help message and exit") + group2.add_argument("--version", "-v", help="Print the version", default=False, action='store_true') group2.add_argument("--threads", "-t", help="Number of threads to run the programs (default: %(default)s)", type=int, required=False, default=1) group2.add_argument("--depth", "-d", help="Specify the minimum depth used to identify loci in paired-end reads (default: %(default)s)", type=int, required=False, @@ -212,7 +215,7 @@ def get_args() -> argparse.ArgumentParser: required=False, default="_alleles.tfa") group2.add_argument("--profile", "-p", help="Name of allele profile to ST mapping file (default: %(default)s)", type=str, required=False, default=os.path.join(os.path.dirname(__file__), "db", "lpneumophila.txt")) - group2.add_argument("--verbose", "-v", help="Print what the script is doing (default: %(default)s)", + group2.add_argument("--verbose", help="Print what the script is doing (default: %(default)s)", action="store_true", required=False, default=False) group2.add_argument("--header", "-e", help="Include column headers in the output table (default: 
%(default)s)", action="store_true", required=False, default=False), group2.add_argument("--length", "-l", help="Specify the BLAST hit length threshold for identifying multiple loci in assembly (default: %(default)s)", type = float, required=False, default=0.3), @@ -222,7 +225,6 @@ def get_args() -> argparse.ArgumentParser: return parser - def fasta_to_dict(FASTA_file: str) -> dict: """Read a fasta file into a dict Dict has headers (minus the > symbol) as keys and the associated @@ -1811,6 +1813,9 @@ def main(): parser = get_args() args = parser.parse_args() + if args.version: + print(version) + sys.exit() inputs = check_input_supplied(args, parser, inputs) inputs = set_inputs(args, inputs) make_output_directory(inputs) From e9383a8a36813e1df5eba9dfdfe9f7247daf34df Mon Sep 17 00:00:00 2001 From: waoverholt Date: Wed, 4 Oct 2023 22:54:36 -0400 Subject: [PATCH 39/40] fixed issue with python 3.12 --- conda-recipe/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index da5d899..ceb7d18 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -17,11 +17,11 @@ requirements: - skip: True # [win] host: - - python >=3.8 + - python >=3.8,<3.12 - pip run: - - python >=3.8 + - python >=3.8,<3.12 - minimap2==2.24 - samtools==1.15.1 - blast==2.13.0 From b5c6791dded70fabf9206a6bb005ce7031c183ea Mon Sep 17 00:00:00 2001 From: waoverholt Date: Wed, 4 Oct 2023 22:55:04 -0400 Subject: [PATCH 40/40] removed depreciated package for version control --- el_gato/el_gato.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/el_gato/el_gato.py b/el_gato/el_gato.py index 1da42b2..ac9da07 100644 --- a/el_gato/el_gato.py +++ b/el_gato/el_gato.py @@ -13,11 +13,13 @@ import math import json from collections import defaultdict, Counter, OrderedDict -from pkg_resources import get_distribution +#from pkg_resources import get_distribution +from importlib import metadata t0 = time.time() script_filename = inspect.getframeinfo(inspect.currentframe()).filename script_path = os.path.dirname(os.path.abspath(script_filename)) -version = get_distribution('elgato').version +#version = get_distribution('elgato').version +version = metadata.version('elgato') class Ref: file = "Ref_Paris_mompS_2.fasta" @@ -1814,7 +1816,7 @@ def main(): parser = get_args() args = parser.parse_args() if args.version: - print(version) + print(f'el_gato version: {version}') sys.exit() inputs = check_input_supplied(args, parser, inputs) inputs = set_inputs(args, inputs)
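Editor's note: the final patch replaces the deprecated pkg_resources lookup with importlib.metadata for reporting the installed version. A minimal sketch of that pattern follows, with an added guard — not part of the patch — for running from a source checkout where the elgato distribution is not installed:

```python
from importlib import metadata

def elgato_version(dist_name: str = "elgato") -> str:
    """Return the installed distribution's version, or a placeholder when absent."""
    try:
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        # Assumption: degrade gracefully when the package is not pip/conda installed.
        return "unknown (not installed)"

print(f"el_gato version: {elgato_version()}")
```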