Skip to content

Commit

Permalink
Merge pull request #444 from jfy133/busco-script
Browse files Browse the repository at this point in the history
Move BUSCO bash code a script
  • Loading branch information
jfy133 authored May 25, 2023
2 parents ab3422b + 0dfad70 commit 75b3b37
Show file tree
Hide file tree
Showing 3 changed files with 161 additions and 145 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL to a parameter (reported by @erikrikarddaniel, fix by @jfy133)
- [#441](https://github.com/nf-core/mag/pull/441) - Deactivated CONCOCT in AWS 'full test' due to very long runtime (fix by @jfy133).
- [#442](https://github.com/nf-core/mag/pull/442) - Remove warning when BUSCO finds no genes in bins, as this can be expected in some datasets (reported by @Lumimar, fix by @jfy133).
- [#444](https://github.com/nf-core/mag/pull/444) - Moved BUSCO bash code to script (by @jfy133)

### `Fixed`

Expand Down
158 changes: 158 additions & 0 deletions bin/run_busco.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
#! /usr/bin/env bash

p=$1
cp_augustus_config=$2
db=$3
bin=$4
task_cpus=$5
lineage_dataset_provided=$6
busco_clean=$7

# ensure augustus has write access to config directory
if [ ${cp_augustus_config} = "Y" ]; then
cp -r /usr/local/config/ augustus_config/
export AUGUSTUS_CONFIG_PATH=augustus_config
fi

# place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it)
if [ ${lineage_dataset_provided} = "Y" ]; then
mkdir dataset
mv ${db} dataset/
fi

# set nullgob: if pattern matches no files, expand to a null string rather than to itself
shopt -s nullglob

# only used for saving busco downloads
most_spec_db="NA"

if busco ${p} \
--mode genome \
--in ${bin} \
--cpu ${task_cpus} \
--out "BUSCO" >${bin}_busco.log 2>${bin}_busco.err; then

# get name of used specific lineage dataset
summaries=(BUSCO/short_summary.specific.*.BUSCO.txt)
if [ ${#summaries[@]} -ne 1 ]; then
echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one."
exit 1
fi
[[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]
db_name_spec="${BASH_REMATCH[1]}"
most_spec_db=${db_name_spec}
echo "Used specific lineage dataset: ${db_name_spec}"

if [ ${lineage_dataset_provided} = "Y" ]; then
cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt

# if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting
# report bin as failed to allow consistent warnings within the pipeline for both settings
if egrep -q $'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log; then
echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log."
echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt"
fi
else
# auto lineage selection
if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log &&
egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log; } ||
{ egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log &&
egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log &&
egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log; }; then
# the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately
# and cause different log messages
echo "Domain and specific lineage could be selected by BUSCO."
cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt

db_name_gen=""
summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt)
if [ ${#summaries_gen[@]} -lt 1 ]; then
echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same."
cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt
db_name_gen=${db_name_spec}
else
[[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]]
db_name_gen="${BASH_REMATCH[1]}"
echo "Used generic lineage dataset: ${db_name_gen}"
cp BUSCO/short_summary.generic.${db_name_gen}.BUSCO.txt short_summary.domain.${db_name_gen}.${bin}.txt
fi

for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do
cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz
break
done
for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do
cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz
break
done

elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log; then
echo "Domain could be selected by BUSCO, but no more specific lineage."
cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt

elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log; then
echo "Domain could be selected by BUSCO, but no more specific lineage."
cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt

elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log; then
# TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*?
echo "Domain could not be selected by BUSCO, but virus dataset was selected."
cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt
else
echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2
exit 1
fi
fi

for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do
cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_spec}.faa.gz
break
done
for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do
cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_spec}.fna.gz
break
done

elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err; then
echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err."
echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt"

elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err; then
echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset."
echo -e "${bin}\tPlacements failed" >"${bin}_busco.failed_bin.txt"

message=$(egrep $'INFO:\t\\S+ selected' ${bin}_busco.log)
[[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]]
db_name_gen="${BASH_REMATCH[1]}"
most_spec_db=${db_name_gen}
echo "Used generic lineage dataset: ${db_name_gen}"
cp BUSCO/auto_lineage/run_${db_name_gen}/short_summary.txt short_summary.domain.${db_name_gen}.${bin}.txt

for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do
cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz
break
done
for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do
cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz
break
done

else
echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2
exit 1
fi

# additionally output genes predicted with Prodigal (GFF3)
if [ -f BUSCO/logs/prodigal_out.log ]; then
mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff"
fi

# output value of most_spec_db
echo ${most_spec_db} > info_most_spec_db.txt

# if needed delete temporary BUSCO files
if [ ${busco_clean} = "Y" ]; then
find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \;
find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \;
find . -depth -type d -name "run_*" -execdir rm -rf "{}" +
fi
147 changes: 2 additions & 145 deletions modules/local/busco.nf
Original file line number Diff line number Diff line change
Expand Up @@ -38,151 +38,8 @@ process BUSCO {
p += " --offline --download_path ${download_folder}"
}
"""
# ensure augustus has write access to config directory
if [ ${cp_augustus_config} = "Y" ] ; then
cp -r /usr/local/config/ augustus_config/
export AUGUSTUS_CONFIG_PATH=augustus_config
fi
# place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it)
if [ ${lineage_dataset_provided} = "Y" ] ; then
mkdir dataset
mv ${db} dataset/
fi
# set nullgob: if pattern matches no files, expand to a null string rather than to itself
shopt -s nullglob
# only used for saving busco downloads
most_spec_db="NA"
if busco ${p} \
--mode genome \
--in ${bin} \
--cpu "${task.cpus}" \
--out "BUSCO" > ${bin}_busco.log 2> ${bin}_busco.err; then
# get name of used specific lineage dataset
summaries=(BUSCO/short_summary.specific.*.BUSCO.txt)
if [ \${#summaries[@]} -ne 1 ]; then
echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one."
exit 1
fi
[[ \$summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]];
db_name_spec="\${BASH_REMATCH[1]}"
most_spec_db=\${db_name_spec}
echo "Used specific lineage dataset: \${db_name_spec}"
if [ ${lineage_dataset_provided} = "Y" ]; then
cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt
# if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting
# report bin as failed to allow consistent warnings within the pipeline for both settings
if egrep -q \$'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log ; then
echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log."
echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt"
fi
else
# auto lineage selection
if { egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \
&& egrep -q \$'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log ; } || \
{ egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \
&& egrep -q \$'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log \
&& egrep -q \$'INFO:\tUsing local lineages directory ' ${bin}_busco.log ; }; then
# the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately
# and cause different log messages
echo "Domain and specific lineage could be selected by BUSCO."
cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt
db_name_gen=""
summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt)
if [ \${#summaries_gen[@]} -lt 1 ]; then
echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same."
cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt
db_name_gen=\${db_name_spec}
else
[[ \$summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]];
db_name_gen="\${BASH_REMATCH[1]}"
echo "Used generic lineage dataset: \${db_name_gen}"
cp BUSCO/short_summary.generic.\${db_name_gen}.BUSCO.txt short_summary.domain.\${db_name_gen}.${bin}.txt
fi
for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do
cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz
break
done
for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do
cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz
break
done
elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log ; then
echo "Domain could be selected by BUSCO, but no more specific lineage."
cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt
elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log ; then
echo "Domain could be selected by BUSCO, but no more specific lineage."
cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt
elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tRunning virus detection pipeline' ${bin}_busco.log ; then
# TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*?
echo "Domain could not be selected by BUSCO, but virus dataset was selected."
cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt
else
echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2
exit 1
fi
fi
for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do
cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_spec}.faa.gz
break
done
for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do
cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_spec}.fna.gz
break
done
elif egrep -q \$'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err ; then
echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err."
echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt"
elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'ERROR:\tPlacements failed' ${bin}_busco.err ; then
echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset."
echo -e "${bin}\tPlacements failed" > "${bin}_busco.failed_bin.txt"
message=\$(egrep \$'INFO:\t\\S+ selected' ${bin}_busco.log)
[[ \$message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]];
db_name_gen="\${BASH_REMATCH[1]}"
most_spec_db=\${db_name_gen}
echo "Used generic lineage dataset: \${db_name_gen}"
cp BUSCO/auto_lineage/run_\${db_name_gen}/short_summary.txt short_summary.domain.\${db_name_gen}.${bin}.txt
for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do
cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz
break
done
for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do
cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz
break
done
else
echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2
exit 1
fi
# additionally output genes predicted with Prodigal (GFF3)
if [ -f BUSCO/logs/prodigal_out.log ]; then
mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff"
fi
# if needed delete temporary BUSCO files
if [ ${busco_clean} = "Y" ]; then
find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \\;
find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \\;
find . -depth -type d -name "run_*" -execdir rm -rf "{}" +
fi
run_busco.sh "${p}" "${cp_augustus_config}" "${db}" "${bin}" ${task.cpus} "${lineage_dataset_provided}" "${busco_clean}"
most_spec_db=\$(<info_most_spec_db.txt)
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down

0 comments on commit 75b3b37

Please sign in to comment.