Skip to content

Commit

Permalink
Merge pull request #97 from EBI-Metagenomics/dev
Browse files Browse the repository at this point in the history
Bugfix release - PR #94
  • Loading branch information
mberacochea authored Apr 28, 2023
2 parents ff0992b + a7bee4b commit aaf02d6
Show file tree
Hide file tree
Showing 44 changed files with 13,228 additions and 321 deletions.
29 changes: 29 additions & 0 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
name: Python unit tests

on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master", "dev" ]

permissions:
contents: read

jobs:
build:

runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v3
- name: Set up Python 3.10
uses: actions/setup-python@v3
with:
python-version: "3.10"
- name: Install dependencies
run: |
pip install -r requirements-test.txt
- name: Unit tests
run: |
# TODO, improve the pythonpath handling
PYTHONPATH="$PYTHONPATH:bin" python -m unittest discover tests
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,21 @@ tower {

You can also directly enter your access token here instead of generating the above-mentioned environment variable.


### GFF output files

The outputs generated from viral prediction tools, ViPhOG annotation, taxonomy assign, and CheckV quality are integrated and summarized in a validated gff file. You can find such output on the 08-final/gff/ folder.

The labels used in the Type column of the gff file corresponds to the following nomenclature according to the [Sequence Ontology resource](http://www.sequenceontology.org/browser/current_svn/term/SO:0000001):

| Type in gff file | Sequence ontology ID |
| ------------- | ------------- |
| viral_sequence | [SO:0001041](http://www.sequenceontology.org/browser/current_svn/term/SO:0001041) |
| prophage | [SO:0001006](http://www.sequenceontology.org/browser/current_svn/term/SO:0001006) |
| CDS | [SO:0000316](http://www.sequenceontology.org/browser/current_svn/term/SO:0000316) |

Note that CDS are reported only when a ViPhOG match has been found.

# Common Workflow Language
VIRify was implemented in [Common Workflow Language (CWL)](https://www.commonwl.org/).

Expand Down
207 changes: 151 additions & 56 deletions bin/parse_viral_pred.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,17 +35,49 @@ def to_tsv(self):
return [self.seq_id, self.category, self.circular, *self.prange]

def get_seq_record(self):
"""Get the SeqRecord with the category and prange encoded in the header.
"""
"""Get the SeqRecord with the category and prange encoded in the header."""
seq_record = copy(self.seq_record)
if self.category == "prophage" and len(self.prange):
seq_record.id += f" prophage-{self.prange[0]}:{self.prange[1]}"
seq_record.id += f"|prophage-{self.prange[0]}:{self.prange[1]}"
if self.circular:
seq_record.id += " phage-circular"
seq_record.id += "|phage-circular"
# clean
seq_record.description = ""
return seq_record

@staticmethod
def get_prophage_metadata_from_contig(contig_header):
"""Obtain the prophage start and end positions.
As well as if the prophage is circular.
This method returns None,None,None if the header doesn't
have a prophage.
:param contig_header: A fasta header
:return: (int) start, (int) end, (bool) circular
"""
circular = "phage-circular" in contig_header
if "prophage" in contig_header:
prange = contig_header.split("|")[1].replace("prophage-", "")
match = re.search(
"(?P<start>\d+):(?P<end>\d+)", prange, re.IGNORECASE
).groupdict()
start = int(match["start"])
end = int(match["end"])
return start, end, circular
return None, None, circular

@staticmethod
def remove_prophage_from_contig(contig_header):
"""Remove any prophage annotations from a contig header
:param contig_header: The contig header
:return: The same contig header without prophage annotations
"""
if "prophage" in contig_header:
contig_header = re.sub(r"\|prophage\-\d+\:\d+", "", contig_header)
if "phage-circular" in contig_header:
contig_header = contig_header.replace("|phage-circular", "")
return contig_header.replace("|", "")

def __hash__(self):
return hash(self.seq_id)

Expand All @@ -54,40 +86,42 @@ def __eq__(self, other):


def parse_pprmeta(file_name):
"""Extract phage hits from PPR-Meta.
"""
"""Extract phage hits from PPR-Meta."""
lc_ids = set()

if isfile(file_name):
result_df = pd.read_csv(file_name, sep=",")

lc_ids = set(result_df[
(result_df["Possible_source"] == "phage")
]["Header"].values)
lc_ids = set(
result_df[(result_df["Possible_source"] == "phage")]["Header"].values
)

print(f"PPR-Meta found {len(lc_ids)} low confidence contigs.")

return lc_ids


def parse_virus_finder(file_name):
"""Extract high and low confidence contigs from virus finder results.
"""
"""Extract high and low confidence contigs from virus finder results."""
hc_ids = set()
lc_ids = set()

if isfile(file_name):
result_df = pd.read_csv(file_name, sep="\t")

hc_ids = set(
result_df[(result_df["pvalue"] < 0.05) &
(result_df["score"] >= 0.90)]["name"].values)

lc_ids = set(result_df[
(result_df["pvalue"] < 0.05) &
(result_df["score"] >= 0.70) &
(result_df["score"] < 0.9)
]["name"].values)
result_df[(result_df["pvalue"] < 0.05) & (result_df["score"] >= 0.90)][
"name"
].values
)

lc_ids = set(
result_df[
(result_df["pvalue"] < 0.05)
& (result_df["score"] >= 0.70)
& (result_df["score"] < 0.9)
]["name"].values
)

print(f"Virus Finder found {len(hc_ids)} high confidence contigs.")
print(f"Virus Finder found {len(lc_ids)} low confidence contigs.")
Expand Down Expand Up @@ -140,7 +174,8 @@ def parse_virus_sorter(sorter_files):
elif category in ["4", "5"]:
# add the prophage position within the contig
prophages.setdefault(record.id, []).append(
Record(record, "prophage", circular, prange))
Record(record, "prophage", circular, prange)
)
else:
print(f"Contig has an invalid category : {category}")

Expand Down Expand Up @@ -192,15 +227,26 @@ def merge_annotations(pprmeta, finder, sorter, assembly):
elif seq_record.id in pprmeta_lc and seq_record.id in finder_lowestc:
lc_predictions_contigs.append(seq_record)

return hc_predictions_contigs, lc_predictions_contigs, prophage_predictions_contigs, \
sorter_hc, sorter_lc, sorter_prophages
return (
hc_predictions_contigs,
lc_predictions_contigs,
prophage_predictions_contigs,
sorter_hc,
sorter_lc,
sorter_prophages,
)


def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
"""Parse VirSorter, VirFinder and PPR-Meta outputs and merge the results.
"""
hc_contigs, lc_contigs, prophage_contigs, sorter_hc, sorter_lc, sorter_prophages = \
merge_annotations(pprmeta, finder, sorter, assembly)
"""Parse VirSorter, VirFinder and PPR-Meta outputs and merge the results."""
(
hc_contigs,
lc_contigs,
prophage_contigs,
sorter_hc,
sorter_lc,
sorter_prophages,
) = merge_annotations(pprmeta, finder, sorter, assembly)

at_least_one = False
name_prefix = ""
Expand All @@ -210,25 +256,37 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
outdir_path = Path(outdir)

if len(hc_contigs):
SeqIO.write(hc_contigs,
outdir / Path(name_prefix + "high_confidence_viral_contigs.fna"), "fasta")
SeqIO.write(
hc_contigs,
outdir / Path(name_prefix + "high_confidence_viral_contigs.fna"),
"fasta",
)
at_least_one = True
if len(lc_contigs):
SeqIO.write(lc_contigs,
outdir / Path(name_prefix + "low_confidence_viral_contigs.fna"), "fasta")
SeqIO.write(
lc_contigs,
outdir / Path(name_prefix + "low_confidence_viral_contigs.fna"),
"fasta",
)
at_least_one = True
if len(prophage_contigs):
SeqIO.write(prophage_contigs,
outdir / Path(name_prefix + "prophages.fna"), "fasta")
SeqIO.write(
prophage_contigs, outdir / Path(name_prefix + "prophages.fna"), "fasta"
)
at_least_one = True

# VirSorter provides some metadata on each annotation
# - is circular
# - prophage start and end within a contig
if sorter_hc or sorter_lc or sorter_prophages:
with open(outdir_path / Path("virsorter_metadata.tsv"), "w") as pm_tsv_file:
header = ["contig", "category", "circular",
"prophage_start", "prophage_end"]
header = [
"contig",
"category",
"circular",
"prophage_start",
"prophage_end",
]
tsv_writer = csv.writer(pm_tsv_file, delimiter="\t")
tsv_writer.writerow(header)
tsv_writer.writerows([shc.to_tsv() for _, shc in sorter_hc.items()])
Expand All @@ -237,8 +295,11 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
tsv_writer.writerows([ph.to_tsv() for ph in plist])

if not at_least_one:
print("Overall, no putative _viral contigs or prophages were detected"
" in the analysed metagenomic assembly", file=sys.stderr)
print(
"Overall, no putative _viral contigs or prophages were detected"
" in the analysed metagenomic assembly",
file=sys.stderr,
)
exit(1)


Expand All @@ -250,25 +311,59 @@ def main(pprmeta, finder, sorter, assembly, outdir, prefix=False):
"""
parser = argparse.ArgumentParser(
description="Write fasta files with predicted _viral contigs sorted in "
"categories and putative prophages")
parser.add_argument("-a", "--assemb", dest="assembly",
help="Metagenomic assembly fasta file", required=True)
parser.add_argument("-f", "--vfout", dest="finder", help="Absolute or "
"relative path to VirFinder output file",
required=False)
parser.add_argument("-s", "--vsfiles", dest="sorter", nargs='+',
help="VirSorter .fasta files (i.e. VIRSorter_cat-1.fasta)"
" VirSorter output", required=False)
parser.add_argument("-p", "--pmout", dest="pprmeta",
help="Absolute or relative path to PPR-Meta output file"
" PPR-Meta output", required=False)
parser.add_argument("-r", "--prefix", dest="prefix",
help="Use the assembly filename as prefix for the outputs",
action="store_true")
parser.add_argument("-o", "--outdir", dest="outdir",
help="Absolute or relative path of directory where output"
" _viral prediction files should be stored (default: cwd)",
default=".")
"categories and putative prophages"
)
parser.add_argument(
"-a",
"--assemb",
dest="assembly",
help="Metagenomic assembly fasta file",
required=True,
)
parser.add_argument(
"-f",
"--vfout",
dest="finder",
help="Absolute or " "relative path to VirFinder output file",
required=False,
)
parser.add_argument(
"-s",
"--vsfiles",
dest="sorter",
nargs="+",
help="VirSorter .fasta files (i.e. VIRSorter_cat-1.fasta)" " VirSorter output",
required=False,
)
parser.add_argument(
"-p",
"--pmout",
dest="pprmeta",
help="Absolute or relative path to PPR-Meta output file" " PPR-Meta output",
required=False,
)
parser.add_argument(
"-r",
"--prefix",
dest="prefix",
help="Use the assembly filename as prefix for the outputs",
action="store_true",
)
parser.add_argument(
"-o",
"--outdir",
dest="outdir",
help="Absolute or relative path of directory where output"
" _viral prediction files should be stored (default: cwd)",
default=".",
)
args = parser.parse_args()

main(args.pprmeta, args.finder, args.sorter, args.assembly, args.outdir, prefix=args.prefix)
main(
args.pprmeta,
args.finder,
args.sorter,
args.assembly,
args.outdir,
prefix=args.prefix,
)
Loading

0 comments on commit aaf02d6

Please sign in to comment.