Skip to content

Commit

Permalink
feat: upgrade to antiSMASH 7
Browse files Browse the repository at this point in the history
* feat: upgrade to antismash 7

* feat: upgrade gtdbtk and gtdb to release 214

* chore: move default logs location

* chore: make an alias for rules/pipelines and name/pep

* fix: grab ARTS from release instead of git clone

* fix: correct ARTS setup

* feat: upgrade bigslice for compatibility with antiSMASH7

* test: update test for 0.7.0

* fix: enable ani_screen in gtdbtk

* fix: correct gtdb release versioning

* chore: set ani_screen off as default for gtdbtk

* docs: add quickstart video

* feat: upgrade extraction from mmseqs2 and clinker

* feat: process mmseqs2 cog feature

* fix: handle non standard BGC genbanks

* feat: enable to switch between antismash 7 and 6

* docs: mention about the WIKI
  • Loading branch information
matinnuhamunada authored Jul 7, 2023
1 parent f3fa8af commit 1b1a5b1
Show file tree
Hide file tree
Showing 127 changed files with 2,587 additions and 1,841 deletions.
33 changes: 21 additions & 12 deletions .examples/_config_example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,22 @@
# - prokka-db (optional): list of the custom accessions to use as prokka reference database.
# - gtdb-tax (optional): output summary file of GTDB-tk with "user_genome" and "classification" as the two minimum columns
# (2) To define a project using a PEP file, only the variable "name" (or its alias "pep") should be given, pointing to the location of the PEP yaml file.
# - name: path to PEP .yaml file. See project example_pep for details.
# - pep: path to PEP .yaml file. See project example_pep for details.
# PS: the variables pep and name are aliases of each other

projects:
# Project 1 (yaml object)
- name: example
samples: .examples/_genome_project_example/samples.csv
rules: .examples/_genome_project_example/project_config.yaml
prokka-db: .examples/_genome_project_example/prokka-db.csv
gtdb-tax: .examples/_genome_project_example/gtdbtk.bac120.summary.tsv

# Project 2 (PEP file)
- name: .examples/_pep_example/project_config.yaml
- pep: .examples/_pep_example/project_config.yaml
- pep: .examples/lactobacillus_delbruecki/project_config.yaml

bgc_projects:
- name: config/lanthipeptide/project_config.yaml
- pep: .examples/lanthipeptide_lactobacillus/project_config.yaml

#### GLOBAL RULE CONFIGURATION ####
# This section configures the rules to run globally.
# Use project-specific rule configurations if you want to run different rules for each project.
# rules: set value to TRUE if you want to run the analysis or FALSE if you don't
rules:
# pipelines or rules: set value to TRUE if you want to run the analysis or FALSE if you don't
pipelines:
seqfu: FALSE
mash: FALSE
fastani: FALSE
Expand Down Expand Up @@ -63,3 +58,17 @@ resources_path:
checkm: resources/checkm
gtdbtk: resources/gtdbtk
#RNAmmer: resources/RNAmmer # If specified, will override Barrnap in Prokka

# Check for valid release versions from https://data.gtdb.ecogenomic.org/releases/
# Examples:
# - release 214.1, release_version r214 --> https://data.gtdb.ecogenomic.org/releases/release214/214.1/auxillary_files/gtdbtk_r214_data.tar.gz
# - release 207, release_version r207_v2 --> https://data.gtdb.ecogenomic.org/releases/release207/207.0/auxillary_files/gtdbtk_r207_v2_data.tar.gz

rule_parameters:
install_gtdbtk:
release: "214.1"
release_version: "r214"
gtdbtk:
ani_screen: FALSE
antismash:
version: "7" # valid versions: 6, 7
5 changes: 0 additions & 5 deletions .examples/lanthipeptide/df_antismash_6.1.1_bgc.csv

This file was deleted.

Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
bgc_id,genome_id,region,accession,start_pos,end_pos,contig_edge,product,region_length,source,gbk_path
CR954253.1.region001,GCA_000056065.1,1.1,CR954253.1,17407,39909,False,['lanthipeptide-class-iii'],22502,bgcflow,data/interim/antismash/6.1.1/GCA_000056065.1/CR954253.1.region001.gbk
CR954253.1.region002,GCA_000056065.1,1.2,CR954253.1,1745672,1767868,False,['lanthipeptide-class-iv'],22196,bgcflow,data/interim/antismash/6.1.1/GCA_000056065.1/CR954253.1.region002.gbk
CP000156.1.region001,GCA_000191165.1,1.1,CP000156.1,1767251,1789447,False,['lanthipeptide-class-iv'],22196,bgcflow,data/interim/antismash/6.1.1/GCA_000191165.1/CP000156.1.region001.gbk
CP000412.1.region001,GCA_000014405.1,1.1,CP000412.1,17283,39785,False,['lanthipeptide-class-iii'],22502,bgcflow,data/interim/antismash/6.1.1/GCA_000014405.1/CP000412.1.region001.gbk
CR954253.1.region001,GCA_000056065.1,1.1,CR954253.1,17407,39909,False,['lanthipeptide-class-iii'],22502,bgcflow,data/interim/antismash/7.0.0/GCA_000056065.1/CR954253.1.region001.gbk
CR954253.1.region003,GCA_000056065.1,1.3,CR954253.1,1745672,1767868,False,['lanthipeptide-class-iv'],22196,bgcflow,data/interim/antismash/7.0.0/GCA_000056065.1/CR954253.1.region003.gbk
CP000156.1.region002,GCA_000191165.1,1.2,CP000156.1,1767251,1789447,False,['lanthipeptide-class-iv'],22196,bgcflow,data/interim/antismash/7.0.0/GCA_000191165.1/CP000156.1.region002.gbk
CP000412.1.region001,GCA_000014405.1,1.1,CP000412.1,17283,39785,False,['lanthipeptide-class-iii'],22502,bgcflow,data/interim/antismash/7.0.0/GCA_000014405.1/CP000412.1.region001.gbk
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
name: lanthipeptide_lactobacillus
pep_version: 2.1.0
description: 'A selection of lanthipeptides from Lactobacillus delbrueckii'
sample_table: df_antismash_6.1.1_bgc.csv
sample_table: df_regions_antismash_7.0.0.csv

rules:
bigslice: TRUE
bigscape: TRUE
query-bigslice: TRUE
clinker: TRUE
interproscan: TRUE
mmseqs2: TRUE
1 change: 1 addition & 0 deletions .github/workflows/push.yml
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ jobs:
python-version: 3.x
- run: pip install git+https://github.com/NBChub/bgcflow_wrapper.git
- run: pip install pytest-cov
- run: pip install alive-progress
- name: Test coverage
run: pytest --cov=.tests/unit .tests/unit/
- name: Build coverage file
Expand Down
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.1.13
rev: v1.5.1
hooks:
- id: forbid-crlf
- id: remove-crlf
Expand All @@ -9,7 +9,7 @@ repos:
- id: remove-tabs
exclude_types: [csv]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.1.0
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
Expand All @@ -21,12 +21,12 @@ repos:
hooks:
- id: isort
- repo: https://github.com/ambv/black
rev: 22.3.0
rev: 23.3.0
hooks:
- id: black
language_version: python3.8
language_version: python3
- repo: https://github.com/PyCQA/flake8
rev: 3.9.2
rev: 5.0.4
hooks:
- id: flake8
args: ['--ignore=E501,W503']
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,21 @@
# - prokka-db (optional): list of the custom accessions to use as prokka reference database.
# - gtdb-tax (optional): output summary file of GTDB-tk with "user_genome" and "classification" as the two minimum columns
# (2) To define a project using a PEP file, only the variable "name" (or its alias "pep") should be given, pointing to the location of the PEP yaml file.
# - name: path to PEP .yaml file. See project example_pep for details.
# - pep: path to PEP .yaml file. See project example_pep for details.
# PS: the variables pep and name are aliases of each other

projects:
# Project 1 (yaml object)
# - name: example
# samples: config/_genome_project_example/samples.csv
# rules: config/_genome_project_example/project_config.yaml
# prokka-db: config/_genome_project_example/prokka-db.csv
# gtdb-tax: config/_genome_project_example/gtdbtk.bac120.summary.tsv

# Project 2 (PEP file)
# - name: config/_pep_example/project_config.yaml
- name: config/lactobacillus_delbruecki/project_config.yaml

bgc_projects:
- pep: config/lanthipeptide/project_config.yaml

#### GLOBAL RULE CONFIGURATION ####
# This section configures the rules to run globally.
# Use project-specific rule configurations if you want to run different rules for each project.
# rules: set value to TRUE if you want to run the analysis or FALSE if you don't
rules:
# pipelines or rules: set value to TRUE if you want to run the analysis or FALSE if you don't
pipelines:
seqfu: FALSE
mash: FALSE
fastani: FALSE
Expand Down Expand Up @@ -60,3 +57,8 @@ resources_path:
checkm: resources/checkm
gtdbtk: resources/gtdbtk
#RNAmmer: resources/RNAmmer # If specified, will override Barrnap in Prokka

rule_parameters:
install_gtdbtk:
release: 214
release_version: 214

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,18 @@
"genome_id": "GCA_000056065.1",
"region": "1.2",
"accession": "CR954253.1",
"start_pos": "125153",
"end_pos": "136119",
"contig_edge": "False",
"product": [
"RiPP-like"
],
"region_length": 10966
},
"CR954253.1.region003": {
"genome_id": "GCA_000056065.1",
"region": "1.3",
"accession": "CR954253.1",
"start_pos": "1745672",
"end_pos": "1767868",
"contig_edge": "False",
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

64 changes: 64 additions & 0 deletions .tests/unit/antismash_summary/data/config/config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# This file should contain everything to configure the workflow on a global scale.

#### PROJECT INFORMATION ####
# This section controls your project configuration.
# Each project is separated by "-".
# A project can be defined as (1) a yaml object or (2) a Portable Encapsulated Project (PEP) file.
# (1) To define a project as a yaml object, it must contain the variables "name" and "samples".
# - name : name of your project
# - samples : a csv file containing a list of genome ids for analysis with multiple sources mentioned. Genome ids must be unique.
# - rules: a yaml file containing project rule configurations. This will override global rule configuration.
# - prokka-db (optional): list of the custom accessions to use as prokka reference database.
# - gtdb-tax (optional): output summary file of GTDB-tk with "user_genome" and "classification" as the two minimum columns
# (2) To define a project using a PEP file, only the variable "name" (or its alias "pep") should be given, pointing to the location of the PEP yaml file.
# - pep: path to PEP .yaml file. See project example_pep for details.
# PS: the variables pep and name are aliases of each other

projects:
# Project 1 (yaml object)
- name: config/lactobacillus_delbruecki/project_config.yaml

bgc_projects:
- pep: config/lanthipeptide/project_config.yaml

#### GLOBAL RULE CONFIGURATION ####
# This section configures the rules to run globally.
# Use project-specific rule configurations if you want to run different rules for each project.
# pipelines or rules: set value to TRUE if you want to run the analysis or FALSE if you don't
pipelines:
seqfu: FALSE
mash: FALSE
fastani: FALSE
checkm: FALSE
gtdbtk: FALSE
prokka-gbk: FALSE
antismash: TRUE
query-bigslice: FALSE
bigscape: FALSE
bigslice: FALSE
automlst-wrapper: FALSE
arts: FALSE
roary: FALSE
eggnog: FALSE
eggnog-roary: FALSE
deeptfactor: FALSE
deeptfactor-roary: FALSE
cblaster-genome: FALSE
cblaster-bgc: FALSE

#### RESOURCES CONFIGURATION ####
# resources : the location of the resources to run the rule.
# The default location is at "resources/{resource_name}".
resources_path:
antismash_db: resources/antismash_db
eggnog_db: resources/eggnog_db
BiG-SCAPE: resources/BiG-SCAPE
bigslice: resources/bigslice
checkm: resources/checkm
gtdbtk: resources/gtdbtk
#RNAmmer: resources/RNAmmer # If specified, will override Barrnap in Prokka

rule_parameters:
install_gtdbtk:
release: 214
release_version: 214
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: Lactobacillus_delbrueckii

pep_version: 2.1.0

description: "Lactobacillus delbrueckii 27 01 2023"

sample_table: samples.csv

#### RULE CONFIGURATION ####
# rules: set value to TRUE if you want to run the analysis or FALSE if you don't
rules:
seqfu: TRUE
mash: TRUE
fastani: TRUE
checkm: FALSE
gtdbtk: FALSE
prokka-gbk: TRUE
antismash: TRUE
query-bigslice: TRUE
bigscape: TRUE
bigslice: TRUE
automlst-wrapper: TRUE
arts: TRUE
roary: TRUE
eggnog: TRUE
eggnog-roary: TRUE
deeptfactor: TRUE
deeptfactor-roary: TRUE
cblaster-genome: TRUE
cblaster-bgc: TRUE
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,3 @@ GCA_000056065.1,ncbi,,,,,,
GCA_000182835.1,ncbi,,,,,,
GCA_000191165.1,ncbi,,,,,,
GCA_000014405.1,ncbi,,,,,,
GCF_024734405.1,ncbi,,,,,,
GCF_025643595.1,ncbi,,,,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
{
"GCA_000014405.1": {
"bgcs_count": 2,
"bgcs_on_contig_edge": 0,
"protoclusters_count": 0,
"cand_clusters_count": 0,
"products": {
"lanthipeptide-class-iii": 1,
"RiPP-like": 1
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"GCA_000056065.1": {
"bgcs_count": 3,
"bgcs_on_contig_edge": 0,
"protoclusters_count": 0,
"cand_clusters_count": 0,
"products": {
"lanthipeptide-class-iii": 1,
"RiPP-like": 1,
"lanthipeptide-class-iv": 1
}
}
}
Loading

0 comments on commit 1b1a5b1

Please sign in to comment.