From 8502b7a81c976eeec466f5b6373894de342a75d0 Mon Sep 17 00:00:00 2001
From: "Josh L. Espinoza" <jespinoz@jcvi.org>
Date: Thu, 21 Dec 2023 19:41:15 -0800
Subject: [PATCH] v1.4.2

---
 CHANGELOG.md                                  |   5 +-
 MODULE_RESOURCES.xlsx                         | Bin 10700 -> 10727 bytes
 README.md                                     |   4 +-
 VERSION                                       |   2 +-
 install/README.md                             |   2 +-
 src/classify-prokaryotic.py                   |   6 +-
 ...custom_humann_database_from_annotations.py | 118 +++++++++++++-----
 7 files changed, 97 insertions(+), 40 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d24460e..315dd6b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@ ________________________________________________________________
 
 #### Current Releases:
 
-**Release v1.4.1 Highlights:**
+**Release v1.4.2 Highlights:**
 
 * **`VEBA` Modules:**
 
@@ -369,6 +369,7 @@ There was a problem importing veba_output/misc/reads_table.tsv:
 
 **Definitely:**
 
+* Add an option to `compile_custom_humann_database_from_annotations.py` to output only the best hit per UniRef identifier for each genome.
 * Use `pigz` instead of `gzip`
 * Create a taxdump for `MicroEuk`
 * Reimplement `compile_eukaryotic_classifications.py`
@@ -405,6 +406,8 @@ ________________________________________________________________
 **Daily Change Log:**
 
+* [2023.12.21] - `GTDB-Tk` changed the name of the archaeal summary file, so `VEBA` was not adding it to the final classification. Fixed this in `classify-prokaryotic.py`.
+* [2023.12.20] - Fixed files not being closed in `compile_custom_humann_database_from_annotations.py` and added options for different annotation file formats (i.e., multilevel, header, and no header).
 * [2023.12.15] - Added `profile-taxonomic.py` module which uses `sylph` to build a sketch database for genomes and queries the genome database, similar to `Kraken`, for taxonomic abundance.
 * [2023.12.14] - Removed the requirement to have `--estimated_assembly_size` for Flye per [Flye Issue #652](https://github.com/fenderglass/Flye/issues/652).
 * [2023.12.14] - Added `sylph` to `VEBA-profile_env` for abundance profiling of genomes.
diff --git a/MODULE_RESOURCES.xlsx b/MODULE_RESOURCES.xlsx
index bf99e33880301c230f8d33c8592a52f5bf875904..232034d069298894dbb6c2deddf6b832077f0c4f 100644
GIT binary patch
[base85-encoded binary deltas omitted (delta 2086 / delta 2050); Bin 10700 -> 10727 bytes]
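A note on the `[2023.12.20]` file-handle fix listed above: the patch closes handles explicitly (`f_taxonomy.close()`, `f_sequences.close()`). For readers adapting the pattern, here is a minimal sketch of the same gzip-or-plain-text logic wrapped in a context manager, which guarantees closure even if parsing raises; the helper name `open_text` and the path `taxonomy.tsv.gz` are illustrative, not part of `VEBA`:

```python
import gzip

def open_text(path, mode="rt"):
    """Transparently open a plain or gzip-compressed text file."""
    return gzip.open(path, mode) if path.endswith(".gz") else open(path, mode)

# The `with` block closes the handle even if a line fails to parse.
with open_text("taxonomy.tsv.gz") as f_taxonomy:
    genome_to_taxonomy = {}
    for line in f_taxonomy:
        line = line.strip()
        if line:
            id_genome, taxonomy = line.split("\t")
            genome_to_taxonomy[id_genome] = taxonomy
```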
diff --git a/README.md b/README.md
index 3d2973b..17e5240 100644
--- a/README.md
+++ b/README.md
@@ -42,7 +42,7 @@ ___________________________________________________________________
 
 ### Announcements
 
-* **What's new in `VEBA v1.4.1`?**
+* **What's new in `VEBA v1.4.2`?**
 
 * **`VEBA` Modules:**
 
@@ -67,7 +67,7 @@ ___________________________________________________________________
 
 ### Installation and databases
 
-**Current Stable Version:** [`v1.4.1`](https://github.com/jolespin/veba/releases/tag/v1.4.1)
+**Current Stable Version:** [`v1.4.2`](https://github.com/jolespin/veba/releases/tag/v1.4.2)
 
 **Current Database Version:** `VDB_v6`
diff --git a/VERSION b/VERSION
index 35cf735..8fc3b86 100644
--- a/VERSION
+++ b/VERSION
@@ -1,2 +1,2 @@
-1.4.1
+1.4.2
 VDB_v6
diff --git a/install/README.md b/install/README.md
index 7f796d0..67414bf 100644
--- a/install/README.md
+++ b/install/README.md
@@ -85,7 +85,7 @@ The `VEBA` installation is going to configure some `conda` environments for you
 ```
 # For stable version, download and decompress the tarball:
-VERSION="1.4.1"
+VERSION="1.4.2"
 wget https://github.com/jolespin/veba/archive/refs/tags/v${VERSION}.tar.gz
 tar -xvf v${VERSION}.tar.gz && mv veba-${VERSION} veba
 
diff --git a/src/classify-prokaryotic.py b/src/classify-prokaryotic.py
index e6f1d47..e4b6428 100755
--- a/src/classify-prokaryotic.py
+++ b/src/classify-prokaryotic.py
@@ -15,7 +15,7 @@ pd.options.display.max_colwidth = 100
 # from tqdm import tqdm
 
 __program__ = os.path.split(sys.argv[0])[-1]
-__version__ = "2023.11.30"
+__version__ = "2023.12.21"
 
 # GTDB-Tk
 def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directories, opts):
@@ -93,8 +93,8 @@ def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directo
         os.environ["concatenate_dataframes.py"],
         "--axis 0",
         "--allow_empty_or_missing_files",
-        os.path.join(output_directory, "classify", "gtdbtk.ar122.summary.tsv"),
-        os.path.join(output_directory, "classify", "gtdbtk.bac120.summary.tsv"),
+        os.path.join(output_directory, "classify", "gtdbtk.ar*.summary.tsv"),
+        os.path.join(output_directory, "classify", "gtdbtk.bac*.summary.tsv"),
         ">",
         os.path.join(directories["output"], "taxonomy.tsv"),
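Context for the `classify-prokaryotic.py` hunk above: `GTDB-Tk` renamed its archaeal summary file when the marker set changed (`gtdbtk.ar122.summary.tsv` became `gtdbtk.ar53.summary.tsv` in newer releases), so the hardcoded filename matched nothing and archaea were silently dropped from `taxonomy.tsv`. The wildcard tolerates either name. Below is a minimal sketch of the glob-and-concatenate behavior the patched command relies on; the paths are hypothetical and this stands in for, but is not, the actual `concatenate_dataframes.py`:

```python
import glob
import pandas as pd

# Both old (ar122) and new (ar53) archaeal summary names match "gtdbtk.ar*.summary.tsv"
filepaths  = glob.glob("veba_output/classify/gtdbtk.ar*.summary.tsv")
filepaths += glob.glob("veba_output/classify/gtdbtk.bac*.summary.tsv")

# Missing files never enter the glob results, which is why the command can pass
# --allow_empty_or_missing_files and still succeed with only bacteria or only archaea.
dataframes = [pd.read_csv(fp, sep="\t", index_col=0) for fp in filepaths]
if dataframes:
    pd.concat(dataframes, axis=0).to_csv("veba_output/taxonomy.tsv", sep="\t")
```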
diff --git a/src/scripts/compile_custom_humann_database_from_annotations.py b/src/scripts/compile_custom_humann_database_from_annotations.py
index 6604413..dffbcf8 100755
--- a/src/scripts/compile_custom_humann_database_from_annotations.py
+++ b/src/scripts/compile_custom_humann_database_from_annotations.py
@@ -11,7 +11,7 @@ pd.options.display.max_colwidth = 100
 # from tqdm import tqdm
 
 __program__ = os.path.split(sys.argv[0])[-1]
-__version__ = "2023.12.15"
+__version__ = "2023.12.20"
 
 def main(args=None):
@@ -21,23 +21,27 @@
     # Path info
     description = """
    Running: {} v{} via Python v{} | {}""".format(__program__, __version__, sys.version.split(" ")[0], sys.executable)
-    usage = "{} -i <identifier_mapping> -a <annotations> -t <taxonomy> -o <output>".format(__program__)
+    usage = "{} -i <identifier_mapping> -a <annotations> -t <taxonomy> -o <output_table>".format(__program__)
     epilog = "Copyright 2021 Josh L. Espinoza (jespinoz@jcvi.org)"
 
     # Parser
     parser = argparse.ArgumentParser(description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
 
     # Pipeline
     parser.add_argument("-i","--identifier_mapping", default="stdin", type=str, help = "path/to/identifier_mapping.tsv[.gz] [id_protein][id_genome] (No header) [Default: stdin]")
-    parser.add_argument("-a","--annotations", type=str, required=True, help = "path/to/annotations.tsv[.gz] Output from annotations.py. Multi-level header that contains (UniRef, sseqid)")
+    parser.add_argument("-a","--annotations", type=str, required=True, help = "path/to/annotations.tsv[.gz] Output from annotate.py. First column must be qseqid. See --annotation_header_mode")
     parser.add_argument("-t","--taxonomy", type=str, required=True, help = "path/to/taxonomy.tsv[.gz] [id_genome][classification] (No header). Use output from `merge_taxonomy_classifications.py` with --no_header and --no_domain")
     parser.add_argument("-s","--sequences", type=str, required=True, help = "path/to/proteins.fasta[.gz]")
-    parser.add_argument("-o","--output", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/) is recommended [Default: stdout]")
+    parser.add_argument("-o","--output_table", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/ is recommended) [Default: stdout]")
+    parser.add_argument("--output_fasta", type=str, help = "path/to/humann_uniref.fasta[.gz] (Recommended to write to veba_output/tmp/ and then build the `diamond` database in veba_output/profiling/databases/)")
     parser.add_argument("--sep", default=";", help = "Separator for taxonomic levels [Default: ;]")
     # parser.add_argument("--mandatory_taxonomy_prefixes", help = "Comma-separated values for mandatory prefix levels. (e.g., 'c__,f__,g__,s__')")
     # parser.add_argument("--discarded_file", help = "Proteins that have been discarded due to incomplete lineage")
     parser.add_argument("-g", "--no_append_genome_identifier", action="store_true", help = "Don't add genome to taxonomic lineage")
     parser.add_argument("--genome_prefix", type=str, default="t__", help = "Taxonomic level prefix for genome")
     parser.add_argument("--header", action="store_true", help = "Write header")
+    parser.add_argument("-m", "--annotation_header_mode", default="multilevel", choices={"header", "multilevel", "no_header"}, help = "'multilevel' assumes a 2-level header whose first level contains 'UniRef' and second level contains 'sseqid'; 'header' assumes a 1-level header that contains 'sseqid'; 'no_header' assumes there is no header [Default: multilevel]")
+    parser.add_argument("--sseqid_index", type=int, default=1, help = "Python indexing for sseqid position if --annotation_header_mode == no_header. Assumes qseqid is index=0. [Default: 1]")
+
     # Options
     opts = parser.parse_args()
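The three `--annotation_header_mode` values added above differ only in how the table header is parsed before the UniRef hit (`sseqid`) column is extracted. A toy sketch with inline data showing what each mode expects; the identifiers are made up:

```python
import io
import pandas as pd

# 'multilevel' (default): 2-level header as produced by annotate.py
multilevel = "id\tUniRef\tUniRef\nqseqid\tsseqid\tpident\nprotein_1\tUniRef90_P12345\t99.0\n"
df = pd.read_csv(io.StringIO(multilevel), sep="\t", index_col=0, header=[0,1])
protein_to_uniref = df["UniRef"]["sseqid"].dropna()

# 'header': 1-level header that contains 'sseqid'
single = "qseqid\tsseqid\tpident\nprotein_1\tUniRef90_P12345\t99.0\n"
df = pd.read_csv(io.StringIO(single), sep="\t", index_col=0)
protein_to_uniref = df["sseqid"].dropna()

# 'no_header': positional access; qseqid is column 0 in the raw row, but
# index_col=0 consumes it, hence the (sseqid_index - 1) shift in the script
headerless = "protein_1\tUniRef90_P12345\t99.0\n"
sseqid_index = 1
df = pd.read_csv(io.StringIO(headerless), sep="\t", index_col=0, header=None)
protein_to_uniref = df.iloc[:, sseqid_index - 1].dropna()
```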
[Default: 1]") + # Options opts = parser.parse_args() @@ -46,49 +50,102 @@ def main(args=None): if opts.identifier_mapping == "stdin": opts.identifier_mapping = sys.stdin + else: + if opts.identifier_mapping.endswith(".gz"): + opts.identifier_mapping = gzip.open(opts.identifier_mapping, "rt") + else: + opts.identifier_mapping = open(opts.identifier_mapping, "r") + + if opts.output_table == "stdout": + opts.output_table = sys.stdout + + if opts.output_fasta is not None: + if opts.output_fasta.endswith(".gz"): + opts.output_fasta = gzip.open(opts.output_fasta, "wt") + else: + opts.output_fasta = open(opts.output_fasta, "w") - if opts.output == "stdout": - opts.output = sys.stdout # Proteins to genomes - protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0] - A1 = set(protein_to_genome.index) - A2= set(protein_to_genome.values) + # protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0] + protein_to_genome = dict() + for line in tqdm(opts.identifier_mapping, desc="Getting genome for each protein: {}".format(opts.identifier_mapping), unit=" Proteins"): + line = line.strip() + if line: + id_protein, id_genome = line.split("\t") + protein_to_genome[id_protein] = id_genome + + A1 = set(protein_to_genome.keys()) + A2= set(protein_to_genome.values()) print("--identifier_mapping", opts.identifier_mapping, file=sys.stderr) print(" * {} proteins".format(len(A1)), file=sys.stderr) print(" * {} genomes".format(len(A2)), file=sys.stderr) # Annotations - df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1]) - assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`" - df_annotations = df_annotations["UniRef"] - protein_to_uniref = df_annotations["sseqid"].dropna() + if opts.annotation_header_mode == "no_header": + df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=None) + protein_to_uniref = df_annotations.iloc[:,opts.sseqid_index - 1].dropna() + else: + if opts.annotation_header_mode == "multilevel": + df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1]) + assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`" + df_annotations = df_annotations["UniRef"] + else: + # Annotations + df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0) + protein_to_uniref = df_annotations["sseqid"].dropna() + B1 = set(protein_to_uniref.index) B2 = set(protein_to_uniref.values) print("--annotations", opts.annotations, file=sys.stderr) print(" * {} proteins".format(len(B1)), file=sys.stderr) print(" * {} UniRef hits".format(len(B2)), file=sys.stderr) + # Taxonomy - genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0] - C1 = set(genome_to_taxonomy.index) - C2 = set(genome_to_taxonomy.values) + # genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0] + if opts.taxonomy.endswith(".gz"): + f_taxonomy = gzip.open(opts.taxonomy, "rt") + else: + f_taxonomy = open(opts.taxonomy, "r") + + genome_to_taxonomy = dict() + for line in tqdm(f_taxonomy, desc="Getting taxonomy for each genome: {}".format(opts.taxonomy), unit=" Genomes"): + line = line.strip() + if line: 
@@ -98,22 +155,19 @@
 
     # Append genome to taxonomy
     if not opts.no_append_genome_identifier:
         tmp = dict()
-        for id_genome, taxonomy in genome_to_taxonomy.items():
+        for id_genome, taxonomy in tqdm(genome_to_taxonomy.items(), desc="Adding genome identifiers"):
             tmp[id_genome] = "{}{}{}{}".format(taxonomy, opts.sep, opts.genome_prefix, id_genome)
-        genome_to_taxonomy = pd.Series(tmp)
+        genome_to_taxonomy = tmp
 
     # id_protein, uniref_hit, len, lineage
     df_output = protein_to_uniref.to_frame("UniRef")
-    df_output["Length"] = protein_to_length[protein_to_uniref.index]
-    df_output["Taxonomy"] = protein_to_uniref.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
+    df_output["Length"] = df_output.index.map(lambda id_protein: protein_to_length[id_protein])
+    df_output["Taxonomy"] = df_output.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
     df_output.index.name = "id_protein"
-
-    # if opts.discarded_file:
-
-    df_output.to_csv(opts.output, sep="\t", header=bool(opts.header))
+    df_output.to_csv(opts.output_table, sep="\t", header=bool(opts.header))
 
 if __name__ == "__main__":
     main()
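Finally, a sketch of the downstream workflow the `--output_fasta` help text recommends: write the UniRef-annotated subset of proteins to a temporary FASTA, then build the `DIAMOND` database next to the annotation table. All paths are hypothetical, the script is assumed to be on `PATH` via the `VEBA` environment, and the `diamond makedb` flags should be checked against your `diamond` version:

```python
import subprocess

# 1) Compile the annotation table plus the matching filtered protein FASTA
subprocess.run([
    "compile_custom_humann_database_from_annotations.py",
    "-i", "veba_output/misc/identifier_mapping.tsv.gz",   # hypothetical path
    "-a", "veba_output/annotation/annotations.tsv.gz",    # hypothetical path
    "-t", "veba_output/misc/taxonomy.tsv",                # hypothetical path
    "-s", "veba_output/misc/proteins.fasta.gz",           # hypothetical path
    "-o", "veba_output/profiling/databases/humann_uniref_annotations.tsv",
    "--output_fasta", "veba_output/tmp/humann_uniref.fasta",
    "--annotation_header_mode", "multilevel",
], check=True)

# 2) Build the DIAMOND database from the filtered FASTA
subprocess.run([
    "diamond", "makedb",
    "--in", "veba_output/tmp/humann_uniref.fasta",
    "--db", "veba_output/profiling/databases/humann_uniref",
], check=True)
```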