v1.4.2

jolespin · Dec 22, 2023 · 8502b7a · 8502b7a
1 parent 70161aa
commit 8502b7a
Show file tree

Hide file tree

Showing 7 changed files with 97 additions and 40 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,7 +6,7 @@ ________________________________________________________________
 
 #### Current Releases:
 
-**Release v1.4.1 Highlights:**
+**Release v1.4.2 Highlights:**
 
 * **`VEBA` Modules:**
 
@@ -369,6 +369,7 @@ There was a problem importing veba_output/misc/reads_table.tsv:
 
 **Definitely:**
 
+* Add option to `compile_custom_humann_database_from_annotations.py` to only output best hit of a UniRef identifier per genome.
 * Use `pigz` instead of `gzip`
 * Create a taxdump for `MicroEuk`
 * Reimplement `compile_eukaryotic_classifications.py`
@@ -405,6 +406,8 @@ ________________________________________________________________
 <details>
 	<summary>**Daily Change Log:**</summary>
 
+* [2023.12.21] - `GTDB-Tk` changed name of archaea summary file so VEBA was not adding this to final classification. Fixed this in `classify-prokaryotic.py`.
+* [2023.12.20] - Fixed files not being closed in `compile_custom_humann_database_from_annotations.py` and added options to use different annotation file formats (i.e., multilevel, header, and no header).
 * [2023.12.15] - Added `profile-taxonomic.py` module which uses `sylph` to build a sketch database for genomes and queries the genome database similar to `Kraken` for taxonomic abundance.
 * [2023.12.14] - Removed requirement to have `--estimated_assembly_size` for Flye per [Flye Issue #652](https://github.com/fenderglass/Flye/issues/652).
 * [2023.12.14] - Added `sylph` to `VEBA-profile_env` for abundance profiling of genomes.

diff --git a/MODULE_RESOURCES.xlsx b/MODULE_RESOURCES.xlsx
diff --git a/README.md b/README.md
@@ -42,7 +42,7 @@ ___________________________________________________________________
 
 ### Announcements
 
-* **What's new in `VEBA v1.4.1`?**
+* **What's new in `VEBA v1.4.2`?**
 
 * **`VEBA` Modules:**
 
@@ -67,7 +67,7 @@ ___________________________________________________________________
 
 ### Installation and databases
 
-**Current Stable Version:** [`v1.4.1`](https://github.com/jolespin/veba/releases/tag/v1.4.1)
+**Current Stable Version:** [`v1.4.2`](https://github.com/jolespin/veba/releases/tag/v1.4.2)
 
 **Current Database Version:** `VDB_v6`
 

diff --git a/VERSION b/VERSION
@@ -1,2 +1,2 @@
-1.4.1
+1.4.2
 VDB_v6
diff --git a/install/README.md b/install/README.md
@@ -85,7 +85,7 @@ The `VEBA` installation is going to configure some `conda` environments for you
 ```
 # For stable version, download and decompress the tarball:
 
-VERSION="1.4.1"
+VERSION="1.4.2"
 wget https://github.com/jolespin/veba/archive/refs/tags/v${VERSION}.tar.gz
 tar -xvf v${VERSION}.tar.gz && mv veba-${VERSION} veba
 

diff --git a/src/classify-prokaryotic.py b/src/classify-prokaryotic.py
@@ -15,7 +15,7 @@
 pd.options.display.max_colwidth = 100
 # from tqdm import tqdm
 __program__ = os.path.split(sys.argv[0])[-1]
-__version__ = "2023.11.30"
+__version__ = "2023.12.21"
 
 # GTDB-Tk
 def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directories, opts):
@@ -93,8 +93,8 @@ def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directo
         os.environ["concatenate_dataframes.py"],
         "--axis 0",
         "--allow_empty_or_missing_files",
-        os.path.join(output_directory, "classify", "gtdbtk.ar122.summary.tsv"),
-        os.path.join(output_directory, "classify", "gtdbtk.bac120.summary.tsv"),
+        os.path.join(output_directory, "classify", "gtdbtk.ar*.summary.tsv"),
+        os.path.join(output_directory, "classify", "gtdbtk.bac*.summary.tsv"),
         ">",
         os.path.join(directories["output"], "taxonomy.tsv"),
 

diff --git a/src/scripts/compile_custom_humann_database_from_annotations.py b/src/scripts/compile_custom_humann_database_from_annotations.py
@@ -11,7 +11,7 @@
 pd.options.display.max_colwidth = 100
 # from tqdm import tqdm
 __program__ = os.path.split(sys.argv[0])[-1]
-__version__ = "2023.12.15"
+__version__ = "2023.12.20"
 
 
 def main(args=None):
@@ -21,23 +21,27 @@ def main(args=None):
     # Path info
     description = """
     Running: {} v{} via Python v{} | {}""".format(__program__, __version__, sys.version.split(" ")[0], sys.executable)
-    usage = "{} -i <identifier_mapping> -a <annotations> -t <taxnomy> -o <output_table>".format(__program__)
+    usage = "{} -i <identifier_mapping> -a <annotations> -t <taxonomy> -o <output_table>".format(__program__)
     epilog = "Copyright 2021 Josh L. Espinoza ([email protected])"
 
     # Parser
     parser = argparse.ArgumentParser(description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
     # Pipeline
     parser.add_argument("-i","--identifier_mapping", default="stdin", type=str,  help = "path/to/identifier_mapping.tsv[.gz] [id_protein]<tab>[id_genome] (No header) [Default: stdin]")
-    parser.add_argument("-a","--annotations",  type=str,  required=True, help = "path/to/annotations.tsv[.gz] Output from annotations.py. Multi-level header that contains (UniRef, sseqid)")
+    parser.add_argument("-a","--annotations",  type=str,  required=True, help = "path/to/annotations.tsv[.gz] Output from annotations.py. First column must be qseqid.  See --annotation_header_mode")
     parser.add_argument("-t","--taxonomy",  type=str, required=True,  help = "path/to/taxonomy.tsv[.gz] [id_genome]<tab>[classification] (No header).  Use output from `merge_taxonomy_classifications.py` with --no_header and --no_domain")
     parser.add_argument("-s","--sequences",  type=str, required=True,  help = "path/to/proteins.fasta[.gz]")
-    parser.add_argument("-o","--output", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/) is recommended [Default: stdout]")
+    parser.add_argument("-o","--output_table", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/) is recommended [Default: stdout]")
+    parser.add_argument("--output_fasta", type=str, help = "path/to/humann_uniref.fasta[.gz] (Recommended to add to veba_output/tmp/ and then build the diamond base in veba_output/profiling/databases/ )")
     parser.add_argument("--sep", default=";", help = "Separator for taxonomic levels [Default: ;]")
     # parser.add_argument("--mandatory_taxonomy_prefixes", help = "Comma-separated values for mandatory prefix levels. (e.g., 'c__,f__,g__,s__')")
     # parser.add_argument("--discarded_file",  help = "Proteins that have been discarded due to incomplete lineage")
     parser.add_argument("-g", "--no_append_genome_identifier", action="store_true", help = "Don't add genome to taxonomic lineage")
     parser.add_argument("--genome_prefix", type=str, default="t__", help = "Taxonomic level prefix for genome")
     parser.add_argument("--header", action="store_true", help = "Write header")
+    parser.add_argument("-m", "--annotation_header_mode", default="multilevel", choices={"header", "multilevel", "no_header"}, help = "If --annotation_header_mode == 'multiindex' header assumes that contains (UniRef, sseqid), --annotation_header_mode == 'header' assumes one-level header that contains `sseqid`, --annotation_header_mode == 'no_header' assumes there is no header [Default: multilevel]")
+    parser.add_argument("--sseqid_index", type=int, default=1, help = "Python indexing for sseqid position if --annotation_header_mode == no_header.  Assumes qseqid is index=0. [Default: 1]")
+
 
     # Options
     opts = parser.parse_args()
@@ -46,49 +50,102 @@ def main(args=None):
 
     if opts.identifier_mapping == "stdin":
         opts.identifier_mapping = sys.stdin 
+    else:
+        if opts.identifier_mapping.endswith(".gz"):
+            opts.identifier_mapping = gzip.open(opts.identifier_mapping, "rt")
+        else:
+            opts.identifier_mapping = open(opts.identifier_mapping, "r")
+
+    if opts.output_table == "stdout":
+        opts.output_table = sys.stdout 
+
+    if opts.output_fasta is not None:
+        if opts.output_fasta.endswith(".gz"):
+            opts.output_fasta = gzip.open(opts.output_fasta, "wt")
+        else:
+            opts.output_fasta = open(opts.output_fasta, "w")
 
-    if opts.output == "stdout":
-        opts.output = sys.stdout 
 
     # Proteins to genomes
-    protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0]
-    A1 = set(protein_to_genome.index)
-    A2= set(protein_to_genome.values)
+    # protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0]
+    protein_to_genome = dict()
+    for line in tqdm(opts.identifier_mapping,  desc="Getting genome for each protein: {}".format(opts.identifier_mapping), unit=" Proteins"):
+        line = line.strip()
+        if line:
+            id_protein, id_genome = line.split("\t")
+            protein_to_genome[id_protein] = id_genome
+
+    A1 = set(protein_to_genome.keys())
+    A2= set(protein_to_genome.values())
     print("--identifier_mapping", opts.identifier_mapping, file=sys.stderr)
     print(" * {} proteins".format(len(A1)), file=sys.stderr)
     print(" * {} genomes".format(len(A2)), file=sys.stderr)
 
     # Annotations
-    df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1])
-    assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`"
-    df_annotations = df_annotations["UniRef"]
-    protein_to_uniref = df_annotations["sseqid"].dropna()
+    if opts.annotation_header_mode == "no_header":
+        df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=None)
+        protein_to_uniref = df_annotations.iloc[:,opts.sseqid_index - 1].dropna()
+    else:
+        if opts.annotation_header_mode == "multilevel":
+            df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1])
+            assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`"
+            df_annotations = df_annotations["UniRef"]
+        else:
+            # Annotations
+            df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0)
+        protein_to_uniref = df_annotations["sseqid"].dropna()
+
     B1 = set(protein_to_uniref.index)
     B2 = set(protein_to_uniref.values)
 
     print("--annotations", opts.annotations, file=sys.stderr)
     print(" * {} proteins".format(len(B1)), file=sys.stderr)
     print(" * {} UniRef hits".format(len(B2)), file=sys.stderr)
+
     # Taxonomy
-    genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0]
-    C1 = set(genome_to_taxonomy.index)
-    C2 = set(genome_to_taxonomy.values)
+    # genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0]
+    if opts.taxonomy.endswith(".gz"):
+        f_taxonomy = gzip.open(opts.taxonomy, "rt") 
+    else:
+        f_taxonomy = open(opts.taxonomy, "r")
+
+    genome_to_taxonomy = dict()
+    for line in tqdm(f_taxonomy, desc="Getting taxonomy for each genome: {}".format(opts.taxonomy), unit=" Genomes"):
+        line = line.strip()
+        if line:
+            id_genome, taxonomy = line.split("\t")
+            genome_to_taxonomy[id_genome] = taxonomy
+    f_taxonomy.close()
+
+    C1 = set(genome_to_taxonomy.keys())
+    C2 = set(genome_to_taxonomy.values())
 
     print("--taxonomy", opts.taxonomy, file=sys.stderr)
     print(" * {} genomes".format(len(C1)), file=sys.stderr)
     print(" * {} taxonomic classifications".format(len(C2)), file=sys.stderr)
 
     if opts.sequences.endswith(".gz"):
-        f = gzip.open(opts.sequences, "rt") 
+        f_sequences = gzip.open(opts.sequences, "rt") 
     else:
-        f = open(opts.sequences, "r")
+        f_sequences = open(opts.sequences, "r")
 
-    protein_to_length = dict()
-    for header, seq in tqdm(SimpleFastaParser(f), "Calculating length of proteins: {}".format(opts.sequences), unit=" Proteins"):
-        id = header.split(" ")[0]
-        protein_to_length[id] = len(seq)
-    protein_to_length = pd.Series(protein_to_length)
-    D = set(protein_to_length.index)
+    if opts.output_fasta is None:
+        protein_to_length = dict()
+        for header, seq in tqdm(SimpleFastaParser(f_sequences), "Calculating length of proteins: {}".format(opts.sequences), unit=" Proteins"):
+            id = header.split(" ")[0]
+            if id in B1:
+                protein_to_length[id] = len(seq)
+    else:
+        protein_to_length = dict()
+        for header, seq in tqdm(SimpleFastaParser(f_sequences), "Calculating length of proteins: {}".format(opts.sequences), unit=" Proteins"):
+            id = header.split(" ")[0]
+            if id in B1:
+                protein_to_length[id] = len(seq)
+                print(">{}\n{}".format(id, seq), file=opts.output_fasta)
+        opts.output_fasta.close()
+    f_sequences.close()
+
+    D = set(protein_to_length.keys())
 
     # Checks
     assert A1 >= B1, "Not all proteins in --annotations are in --identifier_mapping."
@@ -98,22 +155,19 @@ def main(args=None):
     # Append genome to taxonomy
     if not opts.no_append_genome_identifier:
         tmp = dict()
-        for id_genome, taxonomy in genome_to_taxonomy.items():
+        for id_genome, taxonomy in tqdm(genome_to_taxonomy.items(), desc="Adding genome identifiers"):
             tmp[id_genome] = "{}{}{}{}".format(taxonomy, opts.sep, opts.genome_prefix, id_genome)
-        genome_to_taxonomy = pd.Series(tmp)
+        genome_to_taxonomy = tmp
 
 
 
     # id_protein, uniref_hit, len, lineage
     df_output = protein_to_uniref.to_frame("UniRef")
-    df_output["Length"] = protein_to_length[protein_to_uniref.index]
-    df_output["Taxonomy"] = protein_to_uniref.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
+    df_output["Length"] = df_output.index.map(lambda id_protein: protein_to_length[id_protein])
+    df_output["Taxonomy"] = df_output.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
     df_output.index.name = "id_protein"
 
-
-    # if opts.discarded_file:
-
-    df_output.to_csv(opts.output, sep="\t", header=bool(opts.header))
+    df_output.to_csv(opts.output_table, sep="\t", header=bool(opts.header))
 
 if __name__ == "__main__":
     main()
-Original file line number
+Diff line change
@@ -1,2 +1,2 @@
-.4.1
+.4.2
     VDB_v6