diff --git a/CHANGELOG.md b/CHANGELOG.md
index d24460e..315dd6b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@
 ________________________________________________________________
 
 #### Current Releases:
 
-**Release v1.4.1 Highlights:**
+**Release v1.4.2 Highlights:**
 
 * **`VEBA` Modules:**
@@ -369,6 +369,7 @@ There was a problem importing veba_output/misc/reads_table.tsv:
 
 **Definitely:**
 
+* Add an option to `compile_custom_humann_database_from_annotations.py` to output only the best hit per UniRef identifier per genome.
 * Use `pigz` instead of `gzip`
 * Create a taxdump for `MicroEuk`
 * Reimplement `compile_eukaryotic_classifications.py`
@@ -405,6 +406,8 @@
 ________________________________________________________________
 
 **Daily Change Log:**
+* [2023.12.21] - `GTDB-Tk` changed the name of the archaeal summary file, so `VEBA` was not adding it to the final classification. Fixed this in `classify-prokaryotic.py`.
+* [2023.12.20] - Fixed files not being closed in `compile_custom_humann_database_from_annotations.py` and added options for different annotation table formats (i.e., multilevel, header, and no header).
 * [2023.12.15] - Added `profile-taxonomic.py` module which uses `sylph` to build a sketch database for genomes and queries the genome database similar to `Kraken` for taxonomic abundance.
 * [2023.12.14] - Removed requirement to have `--estimated_assembly_size` for Flye per [Flye Issue #652](https://github.com/fenderglass/Flye/issues/652).
 * [2023.12.14] - Added `sylph` to `VEBA-profile_env` for abundance profiling of genomes.
diff --git a/MODULE_RESOURCES.xlsx b/MODULE_RESOURCES.xlsx
index bf99e33..232034d 100644
Binary files a/MODULE_RESOURCES.xlsx and b/MODULE_RESOURCES.xlsx differ
diff --git a/README.md b/README.md
index 3d2973b..17e5240 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ ___________________________________________________________________
 
 ### Announcements
 
-* **What's new in `VEBA v1.4.1`?**
+* **What's new in `VEBA v1.4.2`?**
 
 * **`VEBA` Modules:**
 
@@ -67,7 +67,7 @@ ___________________________________________________________________
 
 ### Installation and databases
 
-**Current Stable Version:** [`v1.4.1`](https://github.com/jolespin/veba/releases/tag/v1.4.1)
+**Current Stable Version:** [`v1.4.2`](https://github.com/jolespin/veba/releases/tag/v1.4.2)
 
 **Current Database Version:** `VDB_v6`
 
diff --git a/VERSION b/VERSION
index 35cf735..8fc3b86 100644
--- a/VERSION
+++ b/VERSION
@@ -1,2 +1,2 @@
-1.4.1
+1.4.2
 VDB_v6
diff --git a/install/README.md b/install/README.md
index 7f796d0..67414bf 100644
--- a/install/README.md
+++ b/install/README.md
@@ -85,7 +85,7 @@ The `VEBA` installation is going to configure some `conda` environments for you
 
 ```
 # For stable version, download and decompress the tarball:
-VERSION="1.4.1"
+VERSION="1.4.2"
 wget https://github.com/jolespin/veba/archive/refs/tags/v${VERSION}.tar.gz
 tar -xvf v${VERSION}.tar.gz && mv veba-${VERSION} veba
 
diff --git a/src/classify-prokaryotic.py b/src/classify-prokaryotic.py
index e6f1d47..e4b6428 100755
--- a/src/classify-prokaryotic.py
+++ b/src/classify-prokaryotic.py
@@ -15,7 +15,7 @@ pd.options.display.max_colwidth = 100
 # from tqdm import tqdm
 
 __program__ = os.path.split(sys.argv[0])[-1]
-__version__ = "2023.11.30"
+__version__ = "2023.12.21"
 
 # GTDB-Tk
 def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directories, opts):
@@ -93,8 +93,8 @@ def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directo
         os.environ["concatenate_dataframes.py"],
         "--axis 0",
         "--allow_empty_or_missing_files",
-        os.path.join(output_directory, "classify", "gtdbtk.ar122.summary.tsv"),
-        os.path.join(output_directory, "classify", "gtdbtk.bac120.summary.tsv"),
+        os.path.join(output_directory, "classify", "gtdbtk.ar*.summary.tsv"),
+        os.path.join(output_directory, "classify", "gtdbtk.bac*.summary.tsv"),
         ">",
         os.path.join(directories["output"], "taxonomy.tsv"),
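Context for the wildcard change above: `GTDB-Tk` v2 moved from the `ar122` to the `ar53` archaeal marker set, so the summary file became `gtdbtk.ar53.summary.tsv` and the hard-coded path silently matched nothing. A minimal `pandas` sketch of the same glob-then-concatenate behavior; the paths are illustrative, and `VEBA` actually delegates this step to `concatenate_dataframes.py`:

```python
import glob
import pandas as pd

# Collect whichever summary files this GTDB-Tk version wrote
# (gtdbtk.ar122.summary.tsv in older releases, gtdbtk.ar53.summary.tsv in v2)
# and stack them into a single taxonomy table.
summary_files = sorted(
    glob.glob("veba_output/classify/gtdbtk.ar*.summary.tsv")
    + glob.glob("veba_output/classify/gtdbtk.bac*.summary.tsv")
)

dataframes = []
for filepath in summary_files:
    try:
        dataframes.append(pd.read_csv(filepath, sep="\t", index_col=0))
    except pd.errors.EmptyDataError:  # Mirrors --allow_empty_or_missing_files
        continue

if dataframes:
    pd.concat(dataframes, axis=0).to_csv("veba_output/taxonomy.tsv", sep="\t")
```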
diff --git a/src/scripts/compile_custom_humann_database_from_annotations.py b/src/scripts/compile_custom_humann_database_from_annotations.py
index 6604413..dffbcf8 100755
--- a/src/scripts/compile_custom_humann_database_from_annotations.py
+++ b/src/scripts/compile_custom_humann_database_from_annotations.py
@@ -11,7 +11,7 @@ pd.options.display.max_colwidth = 100
 # from tqdm import tqdm
 
 __program__ = os.path.split(sys.argv[0])[-1]
-__version__ = "2023.12.15"
+__version__ = "2023.12.20"
 
 
 def main(args=None):
@@ -21,23 +21,27 @@ def main(args=None):
     # Path info
     description = """
     Running: {} v{} via Python v{} | {}""".format(__program__, __version__, sys.version.split(" ")[0], sys.executable)
-    usage = "{} -i <identifier_mapping> -a <annotations> -t <taxonomy> -o <output>".format(__program__)
+    usage = "{} -i <identifier_mapping> -a <annotations> -t <taxonomy> -o <output_table>".format(__program__)
     epilog = "Copyright 2021 Josh L. Espinoza (jespinoz@jcvi.org)"
 
     # Parser
     parser = argparse.ArgumentParser(description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
 
     # Pipeline
     parser.add_argument("-i","--identifier_mapping", default="stdin", type=str, help = "path/to/identifier_mapping.tsv[.gz] [id_protein][id_genome] (No header) [Default: stdin]")
-    parser.add_argument("-a","--annotations", type=str, required=True, help = "path/to/annotations.tsv[.gz] Output from annotations.py. Multi-level header that contains (UniRef, sseqid)")
+    parser.add_argument("-a","--annotations", type=str, required=True, help = "path/to/annotations.tsv[.gz] Output from annotations.py. First column must be qseqid. See --annotation_header_mode")
     parser.add_argument("-t","--taxonomy", type=str, required=True, help = "path/to/taxonomy.tsv[.gz] [id_genome][classification] (No header). Use output from `merge_taxonomy_classifications.py` with --no_header and --no_domain")
     parser.add_argument("-s","--sequences", type=str, required=True, help = "path/to/proteins.fasta[.gz]")
-    parser.add_argument("-o","--output", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/) is recommended [Default: stdout]")
+    parser.add_argument("-o","--output_table", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/) is recommended [Default: stdout]")
+    parser.add_argument("--output_fasta", type=str, help = "path/to/humann_uniref.fasta[.gz] (Recommended to add to veba_output/tmp/ and then build the DIAMOND database in veba_output/profiling/databases/)")
     parser.add_argument("--sep", default=";", help = "Separator for taxonomic levels [Default: ;]")
     # parser.add_argument("--mandatory_taxonomy_prefixes", help = "Comma-separated values for mandatory prefix levels. (e.g., 'c__,f__,g__,s__')")
     # parser.add_argument("--discarded_file", help = "Proteins that have been discarded due to incomplete lineage")
     parser.add_argument("-g", "--no_append_genome_identifier", action="store_true", help = "Don't add genome to taxonomic lineage")
     parser.add_argument("--genome_prefix", type=str, default="t__", help = "Taxonomic level prefix for genome")
     parser.add_argument("--header", action="store_true", help = "Write header")
+    parser.add_argument("-m", "--annotation_header_mode", default="multilevel", choices={"header", "multilevel", "no_header"}, help = "'multilevel' assumes a 2-level header (i.e., a Pandas MultiIndex) that contains (UniRef, sseqid); 'header' assumes a 1-level header that contains `sseqid`; 'no_header' assumes there is no header [Default: multilevel]")
+    parser.add_argument("--sseqid_index", type=int, default=1, help = "Python indexing for sseqid position if --annotation_header_mode == no_header. Assumes qseqid is index=0. [Default: 1]")
+
     # Options
     opts = parser.parse_args()
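The three `--annotation_header_mode` layouts differ only in where the `sseqid` column lives. A minimal sketch of the three corresponding `pandas` reads, using tiny inline tables; the sample identifiers and the `pident` column are invented for illustration:

```python
import io
import pandas as pd

# "multilevel" (default): a two-level header as written by annotate.py, where
# level 0 groups fields by source (e.g., UniRef) and level 1 names the field.
multilevel = "\tUniRef\tUniRef\n\tsseqid\tpident\nqseqid\t\t\nprotein_1\tUniRef90_A0A009\t99.1\n"
df = pd.read_csv(io.StringIO(multilevel), sep="\t", index_col=0, header=[0, 1])
protein_to_uniref = df["UniRef"]["sseqid"].dropna()

# "header": a single header row that contains a column named `sseqid`.
flat = "qseqid\tsseqid\nprotein_1\tUniRef90_A0A009\n"
df = pd.read_csv(io.StringIO(flat), sep="\t", index_col=0)
protein_to_uniref = df["sseqid"].dropna()

# "no_header": positional columns. qseqid is file column 0 and --sseqid_index
# is the file position of sseqid (default 1); since index_col=0 consumes
# qseqid, file position i maps to dataframe column i - 1.
bare = "protein_1\tUniRef90_A0A009\n"
df = pd.read_csv(io.StringIO(bare), sep="\t", index_col=0, header=None)
sseqid_index = 1
protein_to_uniref = df.iloc[:, sseqid_index - 1].dropna()
```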
[Default: 1]") + # Options opts = parser.parse_args() @@ -46,49 +50,102 @@ def main(args=None): if opts.identifier_mapping == "stdin": opts.identifier_mapping = sys.stdin + else: + if opts.identifier_mapping.endswith(".gz"): + opts.identifier_mapping = gzip.open(opts.identifier_mapping, "rt") + else: + opts.identifier_mapping = open(opts.identifier_mapping, "r") + + if opts.output_table == "stdout": + opts.output_table = sys.stdout + + if opts.output_fasta is not None: + if opts.output_fasta.endswith(".gz"): + opts.output_fasta = gzip.open(opts.output_fasta, "wt") + else: + opts.output_fasta = open(opts.output_fasta, "w") - if opts.output == "stdout": - opts.output = sys.stdout # Proteins to genomes - protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0] - A1 = set(protein_to_genome.index) - A2= set(protein_to_genome.values) + # protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0] + protein_to_genome = dict() + for line in tqdm(opts.identifier_mapping, desc="Getting genome for each protein: {}".format(opts.identifier_mapping), unit=" Proteins"): + line = line.strip() + if line: + id_protein, id_genome = line.split("\t") + protein_to_genome[id_protein] = id_genome + + A1 = set(protein_to_genome.keys()) + A2= set(protein_to_genome.values()) print("--identifier_mapping", opts.identifier_mapping, file=sys.stderr) print(" * {} proteins".format(len(A1)), file=sys.stderr) print(" * {} genomes".format(len(A2)), file=sys.stderr) # Annotations - df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1]) - assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`" - df_annotations = df_annotations["UniRef"] - protein_to_uniref = df_annotations["sseqid"].dropna() + if opts.annotation_header_mode == "no_header": + df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=None) + protein_to_uniref = df_annotations.iloc[:,opts.sseqid_index - 1].dropna() + else: + if opts.annotation_header_mode == "multilevel": + df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1]) + assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`" + df_annotations = df_annotations["UniRef"] + else: + # Annotations + df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0) + protein_to_uniref = df_annotations["sseqid"].dropna() + B1 = set(protein_to_uniref.index) B2 = set(protein_to_uniref.values) print("--annotations", opts.annotations, file=sys.stderr) print(" * {} proteins".format(len(B1)), file=sys.stderr) print(" * {} UniRef hits".format(len(B2)), file=sys.stderr) + # Taxonomy - genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0] - C1 = set(genome_to_taxonomy.index) - C2 = set(genome_to_taxonomy.values) + # genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0] + if opts.taxonomy.endswith(".gz"): + f_taxonomy = gzip.open(opts.taxonomy, "rt") + else: + f_taxonomy = open(opts.taxonomy, "r") + + genome_to_taxonomy = dict() + for line in tqdm(f_taxonomy, desc="Getting taxonomy for each genome: {}".format(opts.taxonomy), unit=" Genomes"): + line = line.strip() + if line: 
@@ -98,22 +155,19 @@ def main(args=None):
     # Append genome to taxonomy
     if not opts.no_append_genome_identifier:
         tmp = dict()
-        for id_genome, taxonomy in genome_to_taxonomy.items():
+        for id_genome, taxonomy in tqdm(genome_to_taxonomy.items(), desc="Adding genome identifiers"):
             tmp[id_genome] = "{}{}{}{}".format(taxonomy, opts.sep, opts.genome_prefix, id_genome)
-        genome_to_taxonomy = pd.Series(tmp)
+        genome_to_taxonomy = tmp
 
     # id_protein, uniref_hit, len, lineage
     df_output = protein_to_uniref.to_frame("UniRef")
-    df_output["Length"] = protein_to_length[protein_to_uniref.index]
-    df_output["Taxonomy"] = protein_to_uniref.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
+    df_output["Length"] = df_output.index.map(lambda id_protein: protein_to_length[id_protein])
+    df_output["Taxonomy"] = df_output.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
     df_output.index.name = "id_protein"
-
-    # if opts.discarded_file:
-
-    df_output.to_csv(opts.output, sep="\t", header=bool(opts.header))
+    df_output.to_csv(opts.output_table, sep="\t", header=bool(opts.header))
 
 if __name__ == "__main__":
     main()
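Stripped of I/O, the final assembly reduces to three lookups joined on the protein identifier. A toy, self-contained version of the table construction above (all identifiers and the lineage are made up):

```python
import pandas as pd

# Toy versions of the three mappings the script builds.
protein_to_uniref = pd.Series({"protein_1": "UniRef90_A0A009"})
protein_to_length = {"protein_1": 245}
protein_to_genome = {"protein_1": "genome_1"}
genome_to_taxonomy = {"genome_1": "p__Pseudomonadota;c__Gammaproteobacteria;t__genome_1"}

# id_protein -> [UniRef, Length, Taxonomy]: the layout written by --output_table.
df_output = protein_to_uniref.to_frame("UniRef")
df_output["Length"] = df_output.index.map(lambda id_protein: protein_to_length[id_protein])
df_output["Taxonomy"] = df_output.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
df_output.index.name = "id_protein"
print(df_output.to_csv(sep="\t", header=False), end="")
```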