Skip to content

Commit

Permalink
v1.4.2
Browse files Browse the repository at this point in the history
  • Loading branch information
jolespin committed Dec 22, 2023
1 parent 70161aa commit 8502b7a
Show file tree
Hide file tree
Showing 7 changed files with 97 additions and 40 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ ________________________________________________________________

#### Current Releases:

**Release v1.4.1 Highlights:**
**Release v1.4.2 Highlights:**

* **`VEBA` Modules:**

Expand Down Expand Up @@ -369,6 +369,7 @@ There was a problem importing veba_output/misc/reads_table.tsv:

**Definitely:**

* Add option to `compile_custom_humann_database_from_annotations.py` to only output best hit of a UniRef identifier per genome.
* Use `pigz` instead of `gzip`
* Create a taxdump for `MicroEuk`
* Reimplement `compile_eukaryotic_classifications.py`
Expand Down Expand Up @@ -405,6 +406,8 @@ ________________________________________________________________
<details>
<summary>**Daily Change Log:**</summary>

* [2023.12.21] - `GTDB-Tk` changed name of archaea summary file so VEBA was not adding this to final classification. Fixed this in `classify-prokaryotic.py`.
* [2023.12.20] - Fixed files not being closed in `compile_custom_humann_database_from_annotations.py` and added options to use different annotation file formats (i.e., multilevel, header, and no header).
* [2023.12.15] - Added `profile-taxonomic.py` module which uses `sylph` to build a sketch database for genomes and queries the genome database similar to `Kraken` for taxonomic abundance.
* [2023.12.14] - Removed requirement to have `--estimated_assembly_size` for Flye per [Flye Issue #652](https://github.com/fenderglass/Flye/issues/652).
* [2023.12.14] - Added `sylph` to `VEBA-profile_env` for abundance profiling of genomes.
Expand Down
Binary file modified MODULE_RESOURCES.xlsx
Binary file not shown.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ ___________________________________________________________________

### Announcements

* **What's new in `VEBA v1.4.1`?**
* **What's new in `VEBA v1.4.2`?**

* **`VEBA` Modules:**

Expand All @@ -67,7 +67,7 @@ ___________________________________________________________________

### Installation and databases

**Current Stable Version:** [`v1.4.1`](https://github.com/jolespin/veba/releases/tag/v1.4.1)
**Current Stable Version:** [`v1.4.2`](https://github.com/jolespin/veba/releases/tag/v1.4.2)

**Current Database Version:** `VDB_v6`

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
1.4.1
1.4.2
VDB_v6
2 changes: 1 addition & 1 deletion install/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ The `VEBA` installation is going to configure some `conda` environments for you
```
# For stable version, download and decompress the tarball:
VERSION="1.4.1"
VERSION="1.4.2"
wget https://github.com/jolespin/veba/archive/refs/tags/v${VERSION}.tar.gz
tar -xvf v${VERSION}.tar.gz && mv veba-${VERSION} veba
Expand Down
6 changes: 3 additions & 3 deletions src/classify-prokaryotic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
pd.options.display.max_colwidth = 100
# from tqdm import tqdm
__program__ = os.path.split(sys.argv[0])[-1]
__version__ = "2023.11.30"
__version__ = "2023.12.21"

# GTDB-Tk
def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directories, opts):
Expand Down Expand Up @@ -93,8 +93,8 @@ def get_gtdbtk_cmd( input_filepaths, output_filepaths, output_directory, directo
os.environ["concatenate_dataframes.py"],
"--axis 0",
"--allow_empty_or_missing_files",
os.path.join(output_directory, "classify", "gtdbtk.ar122.summary.tsv"),
os.path.join(output_directory, "classify", "gtdbtk.bac120.summary.tsv"),
os.path.join(output_directory, "classify", "gtdbtk.ar*.summary.tsv"),
os.path.join(output_directory, "classify", "gtdbtk.bac*.summary.tsv"),
">",
os.path.join(directories["output"], "taxonomy.tsv"),

Expand Down
118 changes: 86 additions & 32 deletions src/scripts/compile_custom_humann_database_from_annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
pd.options.display.max_colwidth = 100
# from tqdm import tqdm
__program__ = os.path.split(sys.argv[0])[-1]
__version__ = "2023.12.15"
__version__ = "2023.12.20"


def main(args=None):
Expand All @@ -21,23 +21,27 @@ def main(args=None):
# Path info
description = """
Running: {} v{} via Python v{} | {}""".format(__program__, __version__, sys.version.split(" ")[0], sys.executable)
usage = "{} -i <identifier_mapping> -a <annotations> -t <taxnomy> -o <output_table>".format(__program__)
usage = "{} -i <identifier_mapping> -a <annotations> -t <taxonomy> -o <output_table>".format(__program__)
epilog = "Copyright 2021 Josh L. Espinoza ([email protected])"

# Parser
parser = argparse.ArgumentParser(description=description, usage=usage, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter)
# Pipeline
parser.add_argument("-i","--identifier_mapping", default="stdin", type=str, help = "path/to/identifier_mapping.tsv[.gz] [id_protein]<tab>[id_genome] (No header) [Default: stdin]")
parser.add_argument("-a","--annotations", type=str, required=True, help = "path/to/annotations.tsv[.gz] Output from annotations.py. Multi-level header that contains (UniRef, sseqid)")
parser.add_argument("-a","--annotations", type=str, required=True, help = "path/to/annotations.tsv[.gz] Output from annotations.py. First column must be qseqid. See --annotation_header_mode")
parser.add_argument("-t","--taxonomy", type=str, required=True, help = "path/to/taxonomy.tsv[.gz] [id_genome]<tab>[classification] (No header). Use output from `merge_taxonomy_classifications.py` with --no_header and --no_domain")
parser.add_argument("-s","--sequences", type=str, required=True, help = "path/to/proteins.fasta[.gz]")
parser.add_argument("-o","--output", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/) is recommended [Default: stdout]")
parser.add_argument("-o","--output_table", type=str, default="stdout", help = "path/to/humann_uniref_annotations.tsv[.gz] (veba_output/profiling/databases/) is recommended [Default: stdout]")
parser.add_argument("--output_fasta", type=str, help = "path/to/humann_uniref.fasta[.gz] (Recommended to add to veba_output/tmp/ and then build the diamond base in veba_output/profiling/databases/ )")
parser.add_argument("--sep", default=";", help = "Separator for taxonomic levels [Default: ;]")
# parser.add_argument("--mandatory_taxonomy_prefixes", help = "Comma-separated values for mandatory prefix levels. (e.g., 'c__,f__,g__,s__')")
# parser.add_argument("--discarded_file", help = "Proteins that have been discarded due to incomplete lineage")
parser.add_argument("-g", "--no_append_genome_identifier", action="store_true", help = "Don't add genome to taxonomic lineage")
parser.add_argument("--genome_prefix", type=str, default="t__", help = "Taxonomic level prefix for genome")
parser.add_argument("--header", action="store_true", help = "Write header")
parser.add_argument("-m", "--annotation_header_mode", default="multilevel", choices={"header", "multilevel", "no_header"}, help = "If --annotation_header_mode == 'multiindex' header assumes that contains (UniRef, sseqid), --annotation_header_mode == 'header' assumes one-level header that contains `sseqid`, --annotation_header_mode == 'no_header' assumes there is no header [Default: multilevel]")
parser.add_argument("--sseqid_index", type=int, default=1, help = "Python indexing for sseqid position if --annotation_header_mode == no_header. Assumes qseqid is index=0. [Default: 1]")


# Options
opts = parser.parse_args()
Expand All @@ -46,49 +50,102 @@ def main(args=None):

if opts.identifier_mapping == "stdin":
opts.identifier_mapping = sys.stdin
else:
if opts.identifier_mapping.endswith(".gz"):
opts.identifier_mapping = gzip.open(opts.identifier_mapping, "rt")
else:
opts.identifier_mapping = open(opts.identifier_mapping, "r")

if opts.output_table == "stdout":
opts.output_table = sys.stdout

if opts.output_fasta is not None:
if opts.output_fasta.endswith(".gz"):
opts.output_fasta = gzip.open(opts.output_fasta, "wt")
else:
opts.output_fasta = open(opts.output_fasta, "w")

if opts.output == "stdout":
opts.output = sys.stdout

# Proteins to genomes
protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0]
A1 = set(protein_to_genome.index)
A2= set(protein_to_genome.values)
# protein_to_genome = pd.read_csv(opts.identifier_mapping, sep="\t", index_col=0, header=None).iloc[:,0]
protein_to_genome = dict()
for line in tqdm(opts.identifier_mapping, desc="Getting genome for each protein: {}".format(opts.identifier_mapping), unit=" Proteins"):
line = line.strip()
if line:
id_protein, id_genome = line.split("\t")
protein_to_genome[id_protein] = id_genome

A1 = set(protein_to_genome.keys())
A2= set(protein_to_genome.values())
print("--identifier_mapping", opts.identifier_mapping, file=sys.stderr)
print(" * {} proteins".format(len(A1)), file=sys.stderr)
print(" * {} genomes".format(len(A2)), file=sys.stderr)

# Annotations
df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1])
assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`"
df_annotations = df_annotations["UniRef"]
protein_to_uniref = df_annotations["sseqid"].dropna()
if opts.annotation_header_mode == "no_header":
df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=None)
protein_to_uniref = df_annotations.iloc[:,opts.sseqid_index - 1].dropna()
else:
if opts.annotation_header_mode == "multilevel":
df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0, header=[0,1])
assert "UniRef" in df_annotations.columns.get_level_values(0), "--annotations must have a 2 level header (i.e., Pandas MultiIndex with 2 levels) where the first level has 'UniRef' as created by `annotate.py`"
df_annotations = df_annotations["UniRef"]
else:
# Annotations
df_annotations = pd.read_csv(opts.annotations, sep="\t", index_col=0)
protein_to_uniref = df_annotations["sseqid"].dropna()

B1 = set(protein_to_uniref.index)
B2 = set(protein_to_uniref.values)

print("--annotations", opts.annotations, file=sys.stderr)
print(" * {} proteins".format(len(B1)), file=sys.stderr)
print(" * {} UniRef hits".format(len(B2)), file=sys.stderr)

# Taxonomy
genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0]
C1 = set(genome_to_taxonomy.index)
C2 = set(genome_to_taxonomy.values)
# genome_to_taxonomy = pd.read_csv(opts.taxonomy, sep="\t", index_col=0, header=None).iloc[:,0]
if opts.taxonomy.endswith(".gz"):
f_taxonomy = gzip.open(opts.taxonomy, "rt")
else:
f_taxonomy = open(opts.taxonomy, "r")

genome_to_taxonomy = dict()
for line in tqdm(f_taxonomy, desc="Getting taxonomy for each genome: {}".format(opts.taxonomy), unit=" Genomes"):
line = line.strip()
if line:
id_genome, taxonomy = line.split("\t")
genome_to_taxonomy[id_genome] = taxonomy
f_taxonomy.close()

C1 = set(genome_to_taxonomy.keys())
C2 = set(genome_to_taxonomy.values())

print("--taxonomy", opts.taxonomy, file=sys.stderr)
print(" * {} genomes".format(len(C1)), file=sys.stderr)
print(" * {} taxonomic classifications".format(len(C2)), file=sys.stderr)

if opts.sequences.endswith(".gz"):
f = gzip.open(opts.sequences, "rt")
f_sequences = gzip.open(opts.sequences, "rt")
else:
f = open(opts.sequences, "r")
f_sequences = open(opts.sequences, "r")

protein_to_length = dict()
for header, seq in tqdm(SimpleFastaParser(f), "Calculating length of proteins: {}".format(opts.sequences), unit=" Proteins"):
id = header.split(" ")[0]
protein_to_length[id] = len(seq)
protein_to_length = pd.Series(protein_to_length)
D = set(protein_to_length.index)
if opts.output_fasta is None:
protein_to_length = dict()
for header, seq in tqdm(SimpleFastaParser(f_sequences), "Calculating length of proteins: {}".format(opts.sequences), unit=" Proteins"):
id = header.split(" ")[0]
if id in B1:
protein_to_length[id] = len(seq)
else:
protein_to_length = dict()
for header, seq in tqdm(SimpleFastaParser(f_sequences), "Calculating length of proteins: {}".format(opts.sequences), unit=" Proteins"):
id = header.split(" ")[0]
if id in B1:
protein_to_length[id] = len(seq)
print(">{}\n{}".format(id, seq), file=opts.output_fasta)
opts.output_fasta.close()
f_sequences.close()

D = set(protein_to_length.keys())

# Checks
assert A1 >= B1, "Not all proteins in --annotations are in --identifier_mapping."
Expand All @@ -98,22 +155,19 @@ def main(args=None):
# Append genome to taxonomy
if not opts.no_append_genome_identifier:
tmp = dict()
for id_genome, taxonomy in genome_to_taxonomy.items():
for id_genome, taxonomy in tqdm(genome_to_taxonomy.items(), desc="Adding genome identifiers"):
tmp[id_genome] = "{}{}{}{}".format(taxonomy, opts.sep, opts.genome_prefix, id_genome)
genome_to_taxonomy = pd.Series(tmp)
genome_to_taxonomy = tmp



# id_protein, uniref_hit, len, lineage
df_output = protein_to_uniref.to_frame("UniRef")
df_output["Length"] = protein_to_length[protein_to_uniref.index]
df_output["Taxonomy"] = protein_to_uniref.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
df_output["Length"] = df_output.index.map(lambda id_protein: protein_to_length[id_protein])
df_output["Taxonomy"] = df_output.index.map(lambda id_protein: genome_to_taxonomy[protein_to_genome[id_protein]])
df_output.index.name = "id_protein"


# if opts.discarded_file:

df_output.to_csv(opts.output, sep="\t", header=bool(opts.header))
df_output.to_csv(opts.output_table, sep="\t", header=bool(opts.header))

if __name__ == "__main__":
main()

0 comments on commit 8502b7a

Please sign in to comment.