
Patch of #534 into dev for v2.5.1 #548

Merged: 5 commits into dev, Jan 26, 2024
Changes from all commits
2 changes: 1 addition & 1 deletion bin/combine_tables.py
@@ -87,7 +87,7 @@ def main(args=None):

if args.gtdbtk_summary:
gtdbtk_results = pd.read_csv(args.gtdbtk_summary, sep="\t")
-if not bins.equals(gtdbtk_results["user_genome"].sort_values().reset_index(drop=True)):
+if len(set(gtdbtk_results["user_genome"].to_list()).difference(set(bins))) > 0:
sys.exit("Bins in GTDB-Tk summary do not match bins in bin depths summary!")
results = pd.merge(
results, gtdbtk_results, left_on="bin", right_on="user_genome", how="outer"
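The new check is deliberately looser than the old one: `Series.equals` requires identical length and order, so a GTDB-Tk summary covering only a subset of the bins would abort even though the outer merge below handles missing bins. The replacement only fails when GTDB-Tk reports a genome the depths summary has never seen. A minimal pandas sketch of the difference, with hypothetical bin names:

```python
import pandas as pd

# Hypothetical data: the depths summary knows one bin that GTDB-Tk never classified.
bins = pd.Series(["bin.1", "bin.2", "bin.3"]).sort_values().reset_index(drop=True)
gtdbtk_results = pd.DataFrame({"user_genome": ["bin.1", "bin.2"]})

# Old check: strict element-wise equality; the shorter GTDB-Tk summary alone makes it False.
print(bins.equals(gtdbtk_results["user_genome"].sort_values().reset_index(drop=True)))  # False -> sys.exit()

# New check: only fail when GTDB-Tk reports a genome missing from the depths summary.
unknown = set(gtdbtk_results["user_genome"].to_list()).difference(set(bins))
print(len(unknown) > 0)  # False -> proceed to the outer merge
```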
19 changes: 10 additions & 9 deletions modules/local/cat.nf
@@ -11,14 +11,14 @@ process CAT {
tuple val(db_name), path("database/*"), path("taxonomy/*")

output:
path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification
path("*.bin2classification.names.txt.gz") , emit: tax_classification_names
path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
path("raw/*.predicted_proteins.faa.gz") , emit: faa
path("raw/*.predicted_proteins.gff.gz") , emit: gff
path("raw/*.log") , emit: log
path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids
path "versions.yml" , emit: versions
tuple val(meta), path("*.bin2classification.names.txt") , emit: tax_classification_names
path("*.ORF2LCA.names.txt.gz") , emit: orf2lca_classification
path("raw/*.ORF2LCA.txt.gz") , emit: orf2lca
path("raw/*.predicted_proteins.faa.gz") , emit: faa
path("raw/*.predicted_proteins.gff.gz") , emit: gff
path("raw/*.log") , emit: log
path("raw/*.bin2classification.txt.gz") , emit: tax_classification_taxids
path "versions.yml" , emit: versions
Review comment on lines +14 to +21 (Member):
Do any of the rest of these files also need a meta? e.g. the names? Or are they not used for anything else?

Reply (Member, Author):
None of these files are re-used later, hence no meta for them.


script:
def official_taxonomy = params.cat_official_taxonomy ? "--only_official" : ""
@@ -31,12 +31,13 @@

mkdir raw
mv *.ORF2LCA.txt *.predicted_proteins.faa *.predicted_proteins.gff *.log *.bin2classification.txt raw/
+cp *.bin2classification.names.txt raw/
gzip "raw/${prefix}.ORF2LCA.txt" \
"raw/${prefix}.concatenated.predicted_proteins.faa" \
"raw/${prefix}.concatenated.predicted_proteins.gff" \
"raw/${prefix}.bin2classification.txt" \
"${prefix}.ORF2LCA.names.txt" \
"${prefix}.bin2classification.names.txt"
"raw/${prefix}.bin2classification.names.txt"

cat <<-END_VERSIONS > versions.yml
"${task.process}":
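Pairing the names file with `val(meta)` in the emit is what the per-sample grouping added in `workflows/mag.nf` below relies on; a bare `path` channel carries no sample identity left to group on. A minimal, self-contained sketch of the pattern (hypothetical process and sample IDs, not the pipeline's real code):

```nextflow
// Hypothetical stand-in for CAT: the output file travels together with its meta map.
process NAMES_LIKE {
    input:
    val(meta)

    output:
    tuple val(meta), path("*.names.txt"), emit: tax_classification_names

    script:
    """
    printf 'bin\\tclassification\\nbin.1\\tBacteria\\n' > ${meta.id}.names.txt
    """
}

workflow {
    NAMES_LIKE ( Channel.of([id: 'sampleA'], [id: 'sampleB']) )
    // Each emission is [meta, file]; a bare `path` emit would drop the meta.
    NAMES_LIKE.out.tax_classification_names.view()
}
```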
3 changes: 0 additions & 3 deletions modules/local/cat_summary.nf
@@ -16,9 +16,6 @@ process CAT_SUMMARY {
script:
def prefix = task.ext.prefix ?: "cat_summary"
"""
-# use find as sometimes these are empty and need to fail gracefully
-find -L -type f -name "*bin2classification.names.txt.gz" -exec sh -c 'for f do gunzip -c \$f > \${f%.*}; done' find-sh {} +
-
bioawk '(NR == 1) || (FNR > 1)' *.txt > ${prefix}.tsv

cat <<-END_VERSIONS > versions.yml
2 changes: 1 addition & 1 deletion modules/local/quast_bins.nf
@@ -11,7 +11,7 @@ process QUAST_BINS {

output:
path "QUAST/*", type: 'dir' , emit: dir
path "QUAST/*-quast_summary.tsv", emit: quast_bin_summaries
tuple val(meta), path("QUAST/*-quast_summary.tsv"), emit: quast_bin_summaries
path "versions.yml" , emit: versions

script:
1 change: 1 addition & 0 deletions subworkflows/local/binning_refinement.nf
@@ -62,6 +62,7 @@ workflow BINNING_REFINEMENT {

// Note: do not `failOnMismatch` on join here, in some cases e.g. MAXBIN2 will fail if no bins, so cannot join!
// Only want to join for DAS_Tool on bins that 'exist'
+
ch_input_for_dastool = ch_contigs_for_dastool.join(ch_fastatocontig2bin_for_dastool, by: 0)

ch_versions = ch_versions.mix(DASTOOL_FASTATOCONTIG2BIN_METABAT2.out.versions.first())
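The comment above is the crux of this join strategy: by default Nextflow's `join` simply drops keys that appear on only one side, which is the desired behaviour when a binner emitted nothing for a sample. A small sketch of that default (hypothetical channel contents):

```nextflow
workflow {
    ch_contigs = Channel.of( [ [id: 'a'], 'a.contigs.fa' ], [ [id: 'b'], 'b.contigs.fa' ] )
    ch_c2b     = Channel.of( [ [id: 'a'], 'a.contig2bin.tsv' ] )  // sample 'b' produced no bins

    // Default join: unmatched keys are silently discarded instead of raising an error,
    // which is why failOnMismatch must stay off here.
    ch_contigs.join(ch_c2b, by: 0).view()
    // -> [[id:a], a.contigs.fa, a.contig2bin.tsv]
}
```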
13 changes: 9 additions & 4 deletions subworkflows/local/depths.nf
@@ -19,9 +19,6 @@ workflow DEPTHS {
main:
ch_versions = Channel.empty()

-
-depths.dump(tag: 'depths', pretty: true)
-
// Compute bin depths for different samples (according to `binning_map_mode`)
// Create a new meta combine key first, but copy meta so that
// we retain the information about binners and domain classification
@@ -62,7 +59,15 @@
}

MAG_DEPTHS_PLOT ( ch_mag_depths_plot, ch_sample_groups.collect() )
-MAG_DEPTHS_SUMMARY ( MAG_DEPTHS.out.depths.map{it[1]}.collect() )
+
+//Depth files that are coming from bins and failed binning refinement are concatenated per meta
+ch_mag_depth_out = MAG_DEPTHS.out.depths
+    .collectFile(keepHeader: true) {
+        meta, depth ->
+            [meta.id, depth]
+    }
+
+MAG_DEPTHS_SUMMARY ( ch_mag_depth_out.collect() )
ch_versions = ch_versions.mix( MAG_DEPTHS_PLOT.out.versions )
ch_versions = ch_versions.mix( MAG_DEPTHS_SUMMARY.out.versions )

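`collectFile` with a closure returning `[name, content]` appends every chunk sharing a name into one file, and `keepHeader: true` should keep only the first header line. A runnable sketch of the per-meta concatenation (hypothetical sample ID and table contents; the real input is `MAG_DEPTHS.out.depths`):

```nextflow
workflow {
    // Two depth tables for the same sample, e.g. raw bins plus bins that failed refinement.
    ch_depths = Channel.of(
        [ [id: 'sampleA'], "bin\tdepth\nbin.1\t12.5\n" ],
        [ [id: 'sampleA'], "bin\tdepth\nbin.2\t3.4\n" ]
    )

    ch_depths
        .collectFile(keepHeader: true) { meta, depth -> [ meta.id, depth ] }
        .view { f -> "${f.name}:\n${f.text}" }
    // -> one file named 'sampleA' holding a single header line and both bin rows
}
```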
9 changes: 4 additions & 5 deletions subworkflows/local/gtdbtk.nf
@@ -25,7 +25,8 @@ workflow GTDBTK {
def completeness = -1
def contamination = -1
def missing, duplicated
-if (params.busco_db.getBaseName().contains('odb10')) {
+def busco_db = file(params.busco_db)
+if (busco_db.getBaseName().contains('odb10')) {
missing = row.'%Missing (specific)' // TODO or just take '%Complete'?
duplicated = row.'%Complete and duplicated (specific)'
} else {
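A note on the change above: `params.busco_db` typically arrives as a plain string, which has no `getBaseName()`; wrapping it in `file()` yields a Path with Nextflow's filename helpers. A one-liner sketch (hypothetical database path):

```nextflow
params.busco_db = 'dbs/bacteria_odb10.2020-03-06.tar.gz'  // hypothetical path

def busco_db = file(params.busco_db)
// getBaseName() drops only the final extension -> 'bacteria_odb10.2020-03-06.tar'
assert busco_db.getBaseName().contains('odb10')
```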
@@ -52,7 +53,7 @@
ch_filtered_bins = bins
.transpose()
.map { meta, bin -> [bin.getName(), bin, meta]}
-.join(ch_bin_metrics, failOnDuplicate: true, failOnMismatch: true)
+.join(ch_bin_metrics, failOnDuplicate: true)
.map { bin_name, bin, meta, completeness, contamination -> [meta, bin, completeness, contamination] }
.branch {
passed: (it[2] != -1 && it[2] >= params.gtdbtk_min_completeness && it[3] != -1 && it[3] <= params.gtdbtk_max_contamination)
@@ -84,11 +85,9 @@

GTDBTK_SUMMARY (
ch_filtered_bins.discarded.map{it[1]}.collect().ifEmpty([]),
-GTDBTK_CLASSIFYWF.out.summary.collect().ifEmpty([]),
+GTDBTK_CLASSIFYWF.out.summary.map{it[1]}.collect().ifEmpty([]),
[],
-// GTDBTK_CLASSIFYWF.out.filtered.collect().ifEmpty([]),
[]
-// GTDBTK_CLASSIFYWF.out.failed.collect().ifEmpty([])
)

emit:
46 changes: 29 additions & 17 deletions workflows/mag.nf
@@ -455,27 +455,30 @@ workflow MAG {
if ( !ch_centrifuge_db_file.isEmpty() ) {
if ( ch_centrifuge_db_file.extension in ['gz', 'tgz'] ) {
// Expects to be tar.gz!
-ch_db_for_centrifuge = CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file ).db
+ch_db_for_centrifuge = CENTRIFUGE_DB_PREPARATION ( ch_centrifuge_db_file )
+    .db
+    .collect()
+    .map{
+        db ->
+            def db_name = db[0].getBaseName().split('\\.')[0]
+            [ db_name, db ]
+    }
} else if ( ch_centrifuge_db_file.isDirectory() ) {
ch_db_for_centrifuge = Channel
.fromPath( "${ch_centrifuge_db_file}/*.cf" )
+    .collect()
+    .map{
+        db ->
+            def db_name = db[0].getBaseName().split('\\.')[0]
+            [ db_name, db ]
+    }
} else {
ch_db_for_centrifuge = Channel.empty()
}
} else {
ch_db_for_centrifuge = Channel.empty()
}

-// Centrifuge val(db_name) has to be the basename of any of the
-// index files up to but not including the final .1.cf
-ch_db_for_centrifuge = ch_db_for_centrifuge
-    .collect()
-    .map{
-        db ->
-            def db_name = db[0].getBaseName().split('\\.')[0]
-            [ db_name, db ]
-    }
-
CENTRIFUGE (
ch_short_reads,
ch_db_for_centrifuge
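Both branches now derive `val(db_name)` the same way; the removed comment documented the constraint, namely that the name must be the index file's basename up to but not including the trailing `.1.cf`. A sketch of that derivation (hypothetical index file name):

```nextflow
// 'p_compressed+h+v.1.cf' --getBaseName()-->   'p_compressed+h+v.1'
//                         --split('\\.')[0]--> 'p_compressed+h+v'
def db = [ file('p_compressed+h+v.1.cf') ]   // hypothetical Centrifuge index list
def db_name = db[0].getBaseName().split('\\.')[0]
assert db_name == 'p_compressed+h+v'
```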
@@ -578,6 +581,7 @@
}

ch_assemblies = Channel.empty()
+
if (!params.skip_megahit){
MEGAHIT ( ch_short_reads_grouped )
ch_megahit_assemblies = MEGAHIT.out.assembly
@@ -791,8 +795,6 @@
[meta_new, bins]
}

-
-
// If any two of the binners are both skipped at once, do not run because DAS_Tool needs at least one
if ( params.refine_bins_dastool ) {
ch_prokarya_bins_dastool = ch_binning_results_bins
@@ -829,8 +831,6 @@
} else if ( params.postbinning_input == 'refined_bins_only' ) {
ch_input_for_postbinning_bins = ch_refined_bins
ch_input_for_postbinning_bins_unbins = ch_refined_bins.mix(ch_refined_unbins)
-// TODO REACTIVATE ONCE PR #489 IS READY!
-// TODO RE-ADD BOTH TO SCHEMA ONCE RE-ADDING
} else if ( params.postbinning_input == 'both' ) {
ch_all_bins = ch_binning_results_bins.mix(ch_refined_bins)
ch_input_for_postbinning_bins = ch_all_bins
@@ -913,7 +913,12 @@

QUAST_BINS ( ch_input_for_quast_bins )
ch_versions = ch_versions.mix(QUAST_BINS.out.versions.first())
-QUAST_BINS_SUMMARY ( QUAST_BINS.out.quast_bin_summaries.collect() )
+ch_quast_bin_summary = QUAST_BINS.out.quast_bin_summaries
+    .collectFile(keepHeader: true) {
+        meta, summary ->
+            ["${meta.id}.tsv", summary]
+    }
+QUAST_BINS_SUMMARY ( ch_quast_bin_summary.collect() )
ch_quast_bins_summary = QUAST_BINS_SUMMARY.out.summary
}

@@ -932,8 +937,15 @@
ch_input_for_postbinning_bins_unbins,
ch_cat_db
)
+// Group all classification results for each sample in a single file
+ch_cat_summary = CAT.out.tax_classification_names
+    .collectFile(keepHeader: true) {
+        meta, classification ->
+            ["${meta.id}.txt", classification]
+    }
+// Group all classification results for the whole run in a single file
CAT_SUMMARY(
-    CAT.out.tax_classification_names.collect()
+    ch_cat_summary.collect()
)
ch_versions = ch_versions.mix(CAT.out.versions.first())
ch_versions = ch_versions.mix(CAT_SUMMARY.out.versions)