From 8324ba7c5b9b666408730137ba964a4628c796af Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Tue, 16 May 2023 19:47:17 +0200 Subject: [PATCH 1/4] Move BUSCO code to a bash script --- CHANGELOG.md | 1 + bin/run_busco.sh | 155 +++++++++++++++++++++++++++++++++++++++++ modules/local/busco.nf | 146 +------------------------------------- 3 files changed, 157 insertions(+), 145 deletions(-) create mode 100755 bin/run_busco.sh diff --git a/CHANGELOG.md b/CHANGELOG.md index 5a874fa8..6a7fc24b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#428](https://github.com/nf-core/mag/pull/428) - Update to nf-core 2.8 `TEMPLATE` (by @jfy133) - [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL to a parameter (reported by @erikrikarddaniel and fixed by @jfy133) +- [#443](https://github.com/nf-core/mag/pull/443) - Moved BUSCO bash code to script (by @jfy133) ### `Fixed` diff --git a/bin/run_busco.sh b/bin/run_busco.sh new file mode 100755 index 00000000..a1362202 --- /dev/null +++ b/bin/run_busco.sh @@ -0,0 +1,155 @@ +#! /usr/bin/env bash + +p=$1 +cp_augustus_config=$2 +db=$3 +bin=$4 +task_cpus=$5 +lineage_dataset_provided=$6 +busco_clean=$7 + +# ensure augustus has write access to config directory +if [ ${cp_augustus_config} = "Y" ] ; then + cp -r /usr/local/config/ augustus_config/ + export AUGUSTUS_CONFIG_PATH=augustus_config +fi + +# place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) +if [ ${lineage_dataset_provided} = "Y" ] ; then + mkdir dataset + mv ${db} dataset/ +fi + +# set nullgob: if pattern matches no files, expand to a null string rather than to itself +shopt -s nullglob + +# only used for saving busco downloads +most_spec_db="NA" + +if busco ${p} \ + --mode genome \ + --in ${bin} \ + --cpu ${task_cpus} \ + --out "BUSCO" > ${bin}_busco.log 2> ${bin}_busco.err; then + + # get name of used specific lineage dataset + summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) + if [ ${#summaries[@]} -ne 1 ]; then + echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." + exit 1 + fi + [[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]; + db_name_spec="${BASH_REMATCH[1]}" + most_spec_db=${db_name_spec} + echo "Used specific lineage dataset: ${db_name_spec}" + + if [ ${lineage_dataset_provided} = "Y" ]; then + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + + # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting + # report bin as failed to allow consistent warnings within the pipeline for both settings + if egrep -q $'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log ; then + echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." + echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" + fi + else + # auto lineage selection + if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log \ + && egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log ; } || \ + { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log \ + && egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log \ + && egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log ; }; then + # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately + # and cause different log messages + echo "Domain and specific lineage could be selected by BUSCO." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + + db_name_gen="" + summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt) + if [ ${#summaries_gen[@]} -lt 1 ]; then + echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + db_name_gen=${db_name_spec} + else + [[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]]; + db_name_gen="${BASH_REMATCH[1]}" + echo "Used generic lineage dataset: ${db_name_gen}" + cp BUSCO/short_summary.generic.${db_name_gen}.BUSCO.txt short_summary.domain.${db_name_gen}.${bin}.txt + fi + + for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz + break + done + for f in BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz + break + done + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log ; then + echo "Domain could be selected by BUSCO, but no more specific lineage." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log ; then + echo "Domain could be selected by BUSCO, but no more specific lineage." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt + + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log ; then + # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? + echo "Domain could not be selected by BUSCO, but virus dataset was selected." + cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt + else + echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2 + exit 1 + fi + fi + + for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_spec}.faa.gz + break + done + for f in BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/run_${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_spec}.fna.gz + break + done + +elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err ; then + echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." + echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" + +elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err ; then + echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." + echo -e "${bin}\tPlacements failed" > "${bin}_busco.failed_bin.txt" + + message=$(egrep $'INFO:\t\\S+ selected' ${bin}_busco.log) + [[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]]; + db_name_gen="${BASH_REMATCH[1]}" + most_spec_db=${db_name_gen} + echo "Used generic lineage dataset: ${db_name_gen}" + cp BUSCO/auto_lineage/run_${db_name_gen}/short_summary.txt short_summary.domain.${db_name_gen}.${bin}.txt + + for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do + cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.${db_name_gen}.faa.gz + break + done + for f in BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do + cat BUSCO/auto_lineage/run_${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.${db_name_gen}.fna.gz + break + done + +else + echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2 + exit 1 +fi + +# additionally output genes predicted with Prodigal (GFF3) +if [ -f BUSCO/logs/prodigal_out.log ]; then + mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff" +fi + +# if needed delete temporary BUSCO files +if [ ${busco_clean} = "Y" ]; then + find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \\; + find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \\; + find . -depth -type d -name "run_*" -execdir rm -rf "{}" + +fi diff --git a/modules/local/busco.nf b/modules/local/busco.nf index 639f4685..f84e72f8 100644 --- a/modules/local/busco.nf +++ b/modules/local/busco.nf @@ -38,151 +38,7 @@ process BUSCO { p += " --offline --download_path ${download_folder}" } """ - # ensure augustus has write access to config directory - if [ ${cp_augustus_config} = "Y" ] ; then - cp -r /usr/local/config/ augustus_config/ - export AUGUSTUS_CONFIG_PATH=augustus_config - fi - - # place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) - if [ ${lineage_dataset_provided} = "Y" ] ; then - mkdir dataset - mv ${db} dataset/ - fi - - # set nullgob: if pattern matches no files, expand to a null string rather than to itself - shopt -s nullglob - - # only used for saving busco downloads - most_spec_db="NA" - - if busco ${p} \ - --mode genome \ - --in ${bin} \ - --cpu "${task.cpus}" \ - --out "BUSCO" > ${bin}_busco.log 2> ${bin}_busco.err; then - - # get name of used specific lineage dataset - summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) - if [ \${#summaries[@]} -ne 1 ]; then - echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." - exit 1 - fi - [[ \$summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]; - db_name_spec="\${BASH_REMATCH[1]}" - most_spec_db=\${db_name_spec} - echo "Used specific lineage dataset: \${db_name_spec}" - - if [ ${lineage_dataset_provided} = "Y" ]; then - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt - - # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting - # report bin as failed to allow consistent warnings within the pipeline for both settings - if egrep -q \$'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log ; then - echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." - echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" - fi - else - # auto lineage selection - if { egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \ - && egrep -q \$'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log ; } || \ - { egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log \ - && egrep -q \$'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log \ - && egrep -q \$'INFO:\tUsing local lineages directory ' ${bin}_busco.log ; }; then - # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately - # and cause different log messages - echo "Domain and specific lineage could be selected by BUSCO." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt - - db_name_gen="" - summaries_gen=(BUSCO/short_summary.generic.*.BUSCO.txt) - if [ \${#summaries_gen[@]} -lt 1 ]; then - echo "No 'BUSCO/short_summary.generic.*.BUSCO.txt' file found. Assuming selected domain and specific lineages are the same." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt - db_name_gen=\${db_name_spec} - else - [[ \$summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]]; - db_name_gen="\${BASH_REMATCH[1]}" - echo "Used generic lineage dataset: \${db_name_gen}" - cp BUSCO/short_summary.generic.\${db_name_gen}.BUSCO.txt short_summary.domain.\${db_name_gen}.${bin}.txt - fi - - for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz - break - done - for f in BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz - break - done - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log ; then - echo "Domain could be selected by BUSCO, but no more specific lineage." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log ; then - echo "Domain could be selected by BUSCO, but no more specific lineage." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.domain.\${db_name_spec}.${bin}.txt - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'INFO:\tRunning virus detection pipeline' ${bin}_busco.log ; then - # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? - echo "Domain could not be selected by BUSCO, but virus dataset was selected." - cp BUSCO/short_summary.specific.\${db_name_spec}.BUSCO.txt short_summary.specific_lineage.\${db_name_spec}.${bin}.txt - else - echo "ERROR: Some not expected case occurred! See ${bin}_busco.log." >&2 - exit 1 - fi - fi - - for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_spec}.faa.gz - break - done - for f in BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/run_\${db_name_spec}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_spec}.fna.gz - break - done - - elif egrep -q \$'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err ; then - echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." - echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" - - elif egrep -q \$'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q \$'ERROR:\tPlacements failed' ${bin}_busco.err ; then - echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." - echo -e "${bin}\tPlacements failed" > "${bin}_busco.failed_bin.txt" - - message=\$(egrep \$'INFO:\t\\S+ selected' ${bin}_busco.log) - [[ \$message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]]; - db_name_gen="\${BASH_REMATCH[1]}" - most_spec_db=\${db_name_gen} - echo "Used generic lineage dataset: \${db_name_gen}" - cp BUSCO/auto_lineage/run_\${db_name_gen}/short_summary.txt short_summary.domain.\${db_name_gen}.${bin}.txt - - for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa; do - cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*faa | gzip >${bin}_buscos.\${db_name_gen}.faa.gz - break - done - for f in BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna; do - cat BUSCO/auto_lineage/run_\${db_name_gen}/busco_sequences/single_copy_busco_sequences/*fna | gzip >${bin}_buscos.\${db_name_gen}.fna.gz - break - done - - else - echo "ERROR: BUSCO analysis failed for some unknown reason! See also ${bin}_busco.err." >&2 - exit 1 - fi - - # additionally output genes predicted with Prodigal (GFF3) - if [ -f BUSCO/logs/prodigal_out.log ]; then - mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff" - fi - - # if needed delete temporary BUSCO files - if [ ${busco_clean} = "Y" ]; then - find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \\; - find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \\; - find . -depth -type d -name "run_*" -execdir rm -rf "{}" + - fi + run_busco.sh "${p}" "${cp_augustus_config}" "${db}" "${bin}" ${task.cpus} "${lineage_dataset_provided}" "${busco_clean}" cat <<-END_VERSIONS > versions.yml "${task.process}": From 17971e2afdd9b2c025f52789d2b31921e057efdb Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Tue, 16 May 2023 20:09:30 +0200 Subject: [PATCH 2/4] Update CHANGELOG.md --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 216bc431..c3be7903 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#429](https://github.com/nf-core/mag/pull/429) - Replaced hardcoded CheckM database auto-download URL to a parameter (reported by @erikrikarddaniel, fix by @jfy133) - [#441](https://github.com/nf-core/mag/pull/441) - Deactivated CONCOCT in AWS 'full test' due to very long runtime (fix by @jfy133). - [#442](https://github.com/nf-core/mag/pull/442) - Remove warning when BUSCO finds no genes in bins, as this can be expected in some datasets (reported by @Lumimar, fix by @jfy133). -- [#443](https://github.com/nf-core/mag/pull/443) - Moved BUSCO bash code to script (by @jfy133) +- [#444](https://github.com/nf-core/mag/pull/444) - Moved BUSCO bash code to script (by @jfy133) ### `Fixed` From b7017c47f71620bdccfa3ff3118d7dd945ce35ce Mon Sep 17 00:00:00 2001 From: James Fellows Yates Date: Wed, 17 May 2023 11:12:37 +0200 Subject: [PATCH 3/4] Fix good catch from @prototaxites --- bin/run_busco.sh | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/bin/run_busco.sh b/bin/run_busco.sh index a1362202..a6541d31 100755 --- a/bin/run_busco.sh +++ b/bin/run_busco.sh @@ -9,13 +9,13 @@ lineage_dataset_provided=$6 busco_clean=$7 # ensure augustus has write access to config directory -if [ ${cp_augustus_config} = "Y" ] ; then +if [ ${cp_augustus_config} = "Y" ]; then cp -r /usr/local/config/ augustus_config/ export AUGUSTUS_CONFIG_PATH=augustus_config fi # place db in extra folder to ensure BUSCO recognizes it as path (instead of downloading it) -if [ ${lineage_dataset_provided} = "Y" ] ; then +if [ ${lineage_dataset_provided} = "Y" ]; then mkdir dataset mv ${db} dataset/ fi @@ -30,7 +30,7 @@ if busco ${p} \ --mode genome \ --in ${bin} \ --cpu ${task_cpus} \ - --out "BUSCO" > ${bin}_busco.log 2> ${bin}_busco.err; then + --out "BUSCO" >${bin}_busco.log 2>${bin}_busco.err; then # get name of used specific lineage dataset summaries=(BUSCO/short_summary.specific.*.BUSCO.txt) @@ -38,7 +38,7 @@ if busco ${p} \ echo "ERROR: none or multiple 'BUSCO/short_summary.specific.*.BUSCO.txt' files found. Expected one." exit 1 fi - [[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]]; + [[ $summaries =~ BUSCO/short_summary.specific.(.*).BUSCO.txt ]] db_name_spec="${BASH_REMATCH[1]}" most_spec_db=${db_name_spec} echo "Used specific lineage dataset: ${db_name_spec}" @@ -48,17 +48,17 @@ if busco ${p} \ # if lineage dataset is provided, BUSCO analysis does not fail in case no genes can be found as when using the auto selection setting # report bin as failed to allow consistent warnings within the pipeline for both settings - if egrep -q $'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log ; then + if egrep -q $'WARNING:\tBUSCO did not find any match.' ${bin}_busco.log; then echo "WARNING: BUSCO could not find any genes for the provided lineage dataset! See also ${bin}_busco.log." - echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" + echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" fi else # auto lineage selection - if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log \ - && egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log ; } || \ - { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log \ - && egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log \ - && egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log ; }; then + if { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && + egrep -q $'INFO:\tLineage \\S+ is selected, supported by ' ${bin}_busco.log; } || + { egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && + egrep -q $'INFO:\tThe results from the Prodigal gene predictor indicate that your data belongs to the mollicutes clade. Testing subclades...' ${bin}_busco.log && + egrep -q $'INFO:\tUsing local lineages directory ' ${bin}_busco.log; }; then # the second statement is necessary, because certain mollicute clades use a different genetic code, are not part of the BUSCO placement tree, are tested separately # and cause different log messages echo "Domain and specific lineage could be selected by BUSCO." @@ -71,7 +71,7 @@ if busco ${p} \ cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt db_name_gen=${db_name_spec} else - [[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]]; + [[ $summaries_gen =~ BUSCO/short_summary.generic.(.*).BUSCO.txt ]] db_name_gen="${BASH_REMATCH[1]}" echo "Used generic lineage dataset: ${db_name_gen}" cp BUSCO/short_summary.generic.${db_name_gen}.BUSCO.txt short_summary.domain.${db_name_gen}.${bin}.txt @@ -86,15 +86,15 @@ if busco ${p} \ break done - elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log ; then + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNo marker genes were found. Root lineage \\S+ is kept' ${bin}_busco.log; then echo "Domain could be selected by BUSCO, but no more specific lineage." cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt - elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log ; then + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tNot enough markers were placed on the tree \\([0-9]*\\). Root lineage \\S+ is kept' ${bin}_busco.log; then echo "Domain could be selected by BUSCO, but no more specific lineage." cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.domain.${db_name_spec}.${bin}.txt - elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log ; then + elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'INFO:\tRunning virus detection pipeline' ${bin}_busco.log; then # TODO double-check if selected dataset is not one of bacteria_*, archaea_*, eukaryota_*? echo "Domain could not be selected by BUSCO, but virus dataset was selected." cp BUSCO/short_summary.specific.${db_name_spec}.BUSCO.txt short_summary.specific_lineage.${db_name_spec}.${bin}.txt @@ -113,16 +113,16 @@ if busco ${p} \ break done -elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err ; then +elif egrep -q $'ERROR:\tNo genes were recognized by BUSCO' ${bin}_busco.err; then echo "WARNING: BUSCO analysis failed due to no recognized genes! See also ${bin}_busco.err." - echo -e "${bin}\tNo genes" > "${bin}_busco.failed_bin.txt" + echo -e "${bin}\tNo genes" >"${bin}_busco.failed_bin.txt" -elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err ; then +elif egrep -q $'INFO:\t\\S+ selected' ${bin}_busco.log && egrep -q $'ERROR:\tPlacements failed' ${bin}_busco.err; then echo "WARNING: BUSCO analysis failed due to failed placements! See also ${bin}_busco.err. Still using results for selected generic lineage dataset." - echo -e "${bin}\tPlacements failed" > "${bin}_busco.failed_bin.txt" + echo -e "${bin}\tPlacements failed" >"${bin}_busco.failed_bin.txt" message=$(egrep $'INFO:\t\\S+ selected' ${bin}_busco.log) - [[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]]; + [[ $message =~ INFO:[[:space:]]([_[:alnum:]]+)[[:space:]]selected ]] db_name_gen="${BASH_REMATCH[1]}" most_spec_db=${db_name_gen} echo "Used generic lineage dataset: ${db_name_gen}" @@ -149,7 +149,7 @@ fi # if needed delete temporary BUSCO files if [ ${busco_clean} = "Y" ]; then - find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \\; - find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \\; + find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \; + find . -depth -type d -name "auto_lineage" -execdir rm -rf "{}" \; find . -depth -type d -name "run_*" -execdir rm -rf "{}" + fi From bb4cefdf2ea7169e030cc4c37ef54a92f50d61ba Mon Sep 17 00:00:00 2001 From: Sabrina Krakau Date: Thu, 25 May 2023 11:23:14 +0200 Subject: [PATCH 4/4] Add handling of env variable most_spec_db --- bin/run_busco.sh | 3 +++ modules/local/busco.nf | 1 + 2 files changed, 4 insertions(+) diff --git a/bin/run_busco.sh b/bin/run_busco.sh index a6541d31..9e022e87 100755 --- a/bin/run_busco.sh +++ b/bin/run_busco.sh @@ -147,6 +147,9 @@ if [ -f BUSCO/logs/prodigal_out.log ]; then mv BUSCO/logs/prodigal_out.log "${bin}_prodigal.gff" fi +# output value of most_spec_db +echo ${most_spec_db} > info_most_spec_db.txt + # if needed delete temporary BUSCO files if [ ${busco_clean} = "Y" ]; then find . -depth -type d -name "augustus_config" -execdir rm -rf "{}" \; diff --git a/modules/local/busco.nf b/modules/local/busco.nf index f84e72f8..9cf4b4ce 100644 --- a/modules/local/busco.nf +++ b/modules/local/busco.nf @@ -39,6 +39,7 @@ process BUSCO { } """ run_busco.sh "${p}" "${cp_augustus_config}" "${db}" "${bin}" ${task.cpus} "${lineage_dataset_provided}" "${busco_clean}" + most_spec_db=\$( versions.yml "${task.process}":