-
Notifications
You must be signed in to change notification settings - Fork 9
/
bgc_selection.smk
64 lines (63 loc) · 3.4 KB
/
bgc_selection.smk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
def get_bgc_inputs(pep_object, antismash_version):
    """
    Given a PEP object, return one GenBank file path per sample row.

    For each row of the sample table, the custom ``gbk_path`` value is used
    when the column exists and the cell holds a real value; otherwise the
    default antiSMASH interim location derived from ``genome_id`` and
    ``bgc_id`` is used.

    Parameters
    ----------
    pep_object : object
        PEP project exposing ``sample_tables`` as a pandas DataFrame with at
        least ``bgc_id`` and ``genome_id`` columns; ``gbk_path`` is optional.
    antismash_version : str
        antiSMASH version used to build the default interim path.

    Returns
    -------
    list
        One path per sample row (str for custom paths, pathlib.Path for
        default antiSMASH locations).
    """
    import pandas as pd  # local import: needed only for NaN detection

    antismash_path = Path(f"data/interim/antismash/{antismash_version}")
    df = pep_object.sample_tables
    # The override column is optional; absence means "use defaults everywhere".
    has_custom = "gbk_path" in df.columns
    gbk_list = []
    for i in df.index:
        bgc_id = df.loc[i, "bgc_id"]
        genome_id = df.loc[i, "genome_id"]
        custom_path = df.loc[i, "gbk_path"] if has_custom else None
        # pandas represents empty cells as NaN (and notna(None) is False),
        # so both NaN and None fall through to the default antiSMASH path.
        if pd.notna(custom_path):
            gbk_path = custom_path
        else:
            gbk_path = antismash_path / genome_id / f"{bgc_id}.gbk"
        gbk_list.append(gbk_path)
    return gbk_list
# Prepare selected BGC GenBank files for downstream analysis:
#   1. run bgc_downstream_prep.py on each input region directory,
#   2. write a taxonomy table for BiG-SLiCE from the GTDB metadata table,
#   3. register the dataset in a shared datasets.tsv,
#   4. emit a BGC-to-file mapping table for BiG-SCAPE visualization.
rule downstream_bgc_prep_selection:
    input:
        # All region .gbk files for this project (custom paths or antiSMASH
        # output), resolved per-sample by get_bgc_inputs.
        gbk=lambda wildcards: get_bgc_inputs(PEP_PROJECTS[wildcards.name], wildcards.version),
        table="data/processed/{name}/tables/df_gtdb_meta.csv",
    output:
        taxonomy="data/interim/bgcs/taxonomy/taxonomy_{name}_antismash_{version}.tsv",
        outdir=directory("data/interim/bgcs/{name}/{version}"),
        bgc_mapping="data/interim/bgcs/{name}/{name}_antismash_{version}.csv",
    conda:
        "../envs/bgc_analytics.yaml"
    params:
        # Shared registry of all prepared datasets, appended to across runs.
        dataset="data/interim/bgcs/datasets.tsv",
    log:
        general="workflow/report/logs/bgcs/downstream_bgc_prep/{name}/downstream_bgc_prep-{version}.log",
        symlink="workflow/report/logs/bgcs/downstream_bgc_prep/{name}/bgc_downstream_bgc_prep-{version}.log",
        taxonomy="workflow/report/logs/bgcs/downstream_bgc_prep/{name}/tax_downstream_bgc_prep-{version}.log",
    # NOTE(review): `sed -i 'a ...'` below carries no address, so sed appends
    # the dataset line after EVERY existing line of {params.dataset}, not once
    # at the end — on repeated runs the file grows multiplicatively. Likely
    # intended `sed -i '$ a ...'` or a plain `echo -e ... >>`. Confirm intent.
    # NOTE(review): the loop iterates over `dirname` of each gbk, so a
    # directory containing several regions is processed once per region file.
    shell:
        """
        echo "Preparing BGCs for {wildcards.name} downstream analysis..." 2>> {log.general}
        #mkdir -p {output.outdir} 2>> {log.general}
        # Generate symlink for each regions in genomes in dataset
        for i in $(dirname {input.gbk})
        do
            echo Processing $i 2>> {log.symlink}
            python workflow/bgcflow/bgcflow/data/bgc_downstream_prep.py $i {output.outdir} 2>> {log.symlink}
        done
        # generate taxonomic information for dataset
        python workflow/bgcflow/bgcflow/data/bigslice_prep.py {input.table} {output.taxonomy} 2>> {log.taxonomy}
        # append new dataset information
        ## check if previous dataset exists
        if [[ -s {params.dataset} ]]
        then
            echo "Previous dataset detected, appending dataset information for {wildcards.name}..."
            sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log.general}
        else
            echo "No previous dataset detected, generating dataset information for {wildcards.name}..."
            echo -e '# Dataset name\tPath to folder\tPath to taxonomy\tDescription' > {params.dataset} 2>> {log.general}
            sed -i 'a {wildcards.name}_antismash_{wildcards.version}\t{wildcards.name}_antismash_{wildcards.version}\ttaxonomy/taxonomy_{wildcards.name}_antismash_{wildcards.version}.tsv\t{wildcards.name}' {params.dataset} 2>> {log.general}
        fi
        # generate mapping for visualization
        python workflow/bgcflow/bgcflow/data/get_bigscape_mapping.py {output.outdir} {output.bgc_mapping} 2>> {log.general}
        """