From 437ec73d54fb2691b0675f53fe6ef144f2be9e5a Mon Sep 17 00:00:00 2001 From: Toni Hermoso Pulido Date: Wed, 16 Oct 2024 18:55:35 +0200 Subject: [PATCH] moved to subworkflow first part --- TODO.md | 3 - main.nf | 105 +++--------------- subworkflows/local/exorthist/prepare_input.nf | 92 +++++++++++++++ 3 files changed, 107 insertions(+), 93 deletions(-) create mode 100644 subworkflows/local/exorthist/prepare_input.nf diff --git a/TODO.md b/TODO.md index c78765a..ffbb841 100644 --- a/TODO.md +++ b/TODO.md @@ -1,7 +1,4 @@ -- Moving modules into modules folder - Adding nextflow_schema.json -- Review container -- Testing new DSL2 version against old one - Move parts into subworkflows for clarity if possible - Convert params.config to params.yaml - Include colors into the terminal messages diff --git a/main.nf b/main.nf index 2f55bad..55be7f1 100644 --- a/main.nf +++ b/main.nf @@ -94,6 +94,7 @@ if ( !blosumfile.exists() ) exit 1, "Missing blosum file: ${blosumfile}!" // LOCAL_MODULES='./modules/local/exorthist' +LOCAL_SUBWORKFLOWS='./subworkflows/local/exorthist' include { CHECK_INPUT } from "${LOCAL_MODULES}/check_input.nf" include { CLUSTER_EXS } from "${LOCAL_MODULES}/cluster_exons.nf" @@ -112,6 +113,9 @@ include { SCORE_EX_MATCHES } from "${LOCAL_MODULES}/score_matches.nf" include { SPLIT_CLUSTERS_IN_CHUNKS } from "${LOCAL_MODULES}/split_clusters_chunks.nf" include { SPLIT_CLUSTERS_BY_SPECIES_PAIRS } from "${LOCAL_MODULES}/split_clusters_species.nf" include { SPLIT_EX_PAIRS_TO_REALIGN } from "${LOCAL_MODULES}/split_pairs.nf" + +include { PREPARE_INPUT } from "${LOCAL_SUBWORKFLOWS}/prepare_input.nf" + /* * Validate input and print log file */ @@ -140,22 +144,8 @@ workflow { // We join channels. If no extraexons, then it's empty, so no problem data_to_annotation_raw = genomes.join(annotations) - pipe_data = data_to_annotation_raw data_to_annotation = data_to_annotation_raw.join(extraexons, remainder: true) - // Print contents of each channel - gtfs.view { "GTF file: $it" } - fastas.view { "FASTA file: $it" } - gtfs_suffix.view { "GTF suffix: $it" } - fastas_suffix.view { "FASTA suffix: $it" } - - genomes.view { "Genome file: $it" } - annotations.view { "Genome file: $it" } - extraexons.view { "Extra: $it" } - data_to_annotation_raw.view { "DAR: $it"} - data_to_annotation.view { "DA: $it"} - pipe_data.view { "PD: $it"} - evodists_ch = Channel.fromPath(params.evodists) clusterfile_ch = Channel.fromPath(params.cluster) if ( params.orthopairs ) { @@ -164,8 +154,7 @@ workflow { orthopairs_ch = file("/path/to/NO_FILE") } - - CHECK_INPUT( + PREPARE_INPUT( evodists_ch, clusterfile_ch, gtfs, @@ -174,50 +163,14 @@ workflow { fastas_suffix, params.long_dist, params.medium_dist, - params.short_dist + params.short_dist, + data_to_annotation, + params.extraexons ) - // Sic: https://nextflow-io.github.io/patterns/optional-input/ - if ( params.extraexons ) { - GENERATE_ANNOTATIONS(data_to_annotation, extraexons) - } else { - GENERATE_ANNOTATIONS(data_to_annotation, file("/path/to/NO_FILE")) - } - - clusters_split_ch = GENERATE_ANNOTATIONS.out.idfolders.toList().map{ [it, it].combinations().findAll{ a, b -> a[0] < b[0]} } - .flatMap() - .map { ["${it[0][0]}-${it[1][0]}".toString(), it[0][1], it[1][1]] } - - - // Copy the gene cluster file to output to use for the exint_plotter and compare_exon_sets modules - SPLIT_CLUSTERS_BY_SPECIES_PAIRS(clusterfile_ch) - - // Split clusters - cls_tab_files_ch = SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.cls_tab_files - SPLIT_CLUSTERS_IN_CHUNKS(cls_tab_files_ch.collect(), clusters_split_ch) - - cls_files_2_align = SPLIT_CLUSTERS_IN_CHUNKS.out.cls_files_2_align - cls_files_2_align_t = cls_files_2_align.transpose().map{[it[0].getFileName().toString()+"-"+it[1].getFileName().toString(), it[0], it[1], it[2]]} - - //Create a channel for the evo distances - sp1_sp2_dist = Channel - .fromPath("${params.evodists}") - .splitText() - .map{"${it}".trim().split("\t")}.map{[it[0]+"-"+it[1], it[2]]} - - sp2_sp1_dist = Channel - .fromPath("${params.evodists}") - .splitText() - .map{"${it}".trim().split("\t")}.map{[it[1]+"-"+it[0], it[2]]} - - species_pairs_dist = sp1_sp2_dist.concat(sp2_sp1_dist) - //Only the species pairs with a common index will be kept - dist_ranges_ch = clusters_split_ch.join(species_pairs_dist).map{[it[0], it[3]]} - alignment_input = cls_files_2_align_t.groupTuple().join(dist_ranges_ch).transpose() - - //the last argument is the protein similarity alignment. - //if a prevaln folder is provided, the protein alignments present in each species pair subfolder will not be repeated. - PARSE_IPA_PROT_ALN(blosumfile, alignment_input) + // the last argument is the protein similarity alignment. + // if a prevaln folder is provided, the protein alignments present in each species pair subfolder will not be repeated. + PARSE_IPA_PROT_ALN(blosumfile, PREPARE_INPUT.out.alignment_input) // Collapse EXs_to_split in batches of 500 files EXs_to_split = PARSE_IPA_PROT_ALN.out.EXs_to_split @@ -226,7 +179,7 @@ workflow { SPLIT_EX_PAIRS_TO_REALIGN(EXs_to_split_batches) EXs_to_realign_batches = SPLIT_EX_PAIRS_TO_REALIGN.out.EXs_to_realign_batches // Flatten the results from the previous batch run and combine with sp1 and sp2 information, using sp1-sp2 as key. - EXs_to_realign = EXs_to_realign_batches.flatten().map{[it.getName().toString().split("_")[0],it]}.groupTuple().join(clusters_split_ch).transpose() + EXs_to_realign = EXs_to_realign_batches.flatten().map{[it.getName().toString().split("_")[0],it]}.groupTuple().join(PREPARE_INPUT.out.clusters_split_ch).transpose() // Realign exons pairs (with multiple hits) REALIGN_EX_PAIRS(blosumfile, EXs_to_realign) // Combine all the aln_info with the realigned_exon_info for each species pair @@ -236,12 +189,12 @@ workflow { // Merge alignments information MERGE_PROT_EX_INT_ALN_INFO(data_4_merge) folder_jscores = MERGE_PROT_EX_INT_ALN_INFO.out.folder_jscores - data_to_score = folder_jscores.join(clusters_split_ch).map{ [it[0], it[1..-1] ]} + data_to_score = folder_jscores.join(PREPARE_INPUT.out.clusters_split_ch).map{ [it[0], it[1..-1] ]} // Score EX matches from aln info SCORE_EX_MATCHES(data_to_score) // Filter the best matches above score cutoffs by target gene. all_scores_to_filt_ch = SCORE_EX_MATCHES.out.all_scores_to_filt - FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE(all_scores_to_filt_ch.join(dist_ranges_ch)) + FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE(all_scores_to_filt_ch.join(PREPARE_INPUT.out.dist_ranges_ch)) // Join filtered scored EX matches filterscore_per_joining_ch = FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining JOIN_FILTERED_EX_MATCHES(filterscore_per_joining_ch.collect()) @@ -265,7 +218,7 @@ workflow { // Re-clustering of genes RECLUSTER_GENES_BY_SPECIES_PAIR( - clusters_split_ch, + PREPARE_INPUT.out.clusters_split_ch, clusterfile_ch, orthopairs_ch ) @@ -278,34 +231,6 @@ workflow { orthopairs_ch ) - // // Review outputs below - CHECK_INPUT.out.run_info.view() - GENERATE_ANNOTATIONS.out.idfolders.view { "ANN: $it" } - SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.cls_tab_files.view() - SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.gene_cluster_file.view() - clusters_split_ch.view { "CL: $it" } - dist_ranges_ch.view { "DR: $it" } - alignment_input.view { "AL: $it" } - PARSE_IPA_PROT_ALN.out.aligned_subclusters_4_splitting.view { "SC: $it" } - PARSE_IPA_PROT_ALN.out.EXs_to_split.view { "EX: $it" } - EXs_to_realign.view { "EXR: $it" } - REALIGN_EX_PAIRS.out.realigned_exons_4_merge.view{ "RER: $it" } - MERGE_PROT_EX_INT_ALN_INFO.out.folder_jscores.view() - MERGE_PROT_EX_INT_ALN_INFO.out.aln_features.view() - MERGE_PROT_EX_INT_ALN_INFO.out.exint_aln.view() - SCORE_EX_MATCHES.out.all_features.view() - SCORE_EX_MATCHES.out.all_scores_to_filt.view() - FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.filterscore_per_joining.view() - FILTER_AND_SELECT_BEST_EX_MATCHES_BY_TARGETGENE.out.best_scored_matches.view() - JOIN_FILTERED_EX_MATCHES.out.filtered_all_scores.view() - COLLAPSE_OVERLAPPING_MATCHES.out.score_exon_hits_pairs.view() - COLLAPSE_OVERLAPPING_MATCHES.out.overlapping_exs.view() - FORMAT_EX_CLUSTERS_INPUT.out.cluster_parts.view() - CLUSTER_EXS.out.unclustered_exs.view() - CLUSTER_EXS.out.ex_clusters.view() - FORMAT_EX_CLUSTERS_OUTPUT.out.exon_cluster_for_reclustering.view() - RECLUSTER_GENES_BY_SPECIES_PAIR.out.recl_genes_for_rec_exons.view() - RECLUSTER_EXS_BY_SPECIES_PAIR.out.recl_exs.view() } workflow.onComplete { diff --git a/subworkflows/local/exorthist/prepare_input.nf b/subworkflows/local/exorthist/prepare_input.nf new file mode 100644 index 0000000..b508703 --- /dev/null +++ b/subworkflows/local/exorthist/prepare_input.nf @@ -0,0 +1,92 @@ +LOCAL_MODULES='../../../modules/local/exorthist' + +include { CHECK_INPUT } from "${LOCAL_MODULES}/check_input.nf" +include { GENERATE_ANNOTATIONS } from "${LOCAL_MODULES}/generate_annotations.nf" +include { SPLIT_CLUSTERS_IN_CHUNKS } from "${LOCAL_MODULES}/split_clusters_chunks.nf" +include { SPLIT_CLUSTERS_BY_SPECIES_PAIRS } from "${LOCAL_MODULES}/split_clusters_species.nf" + +workflow PREPARE_INPUT { + + take: + evodists_ch + clusterfile_ch + gtfs + fastas + gtfs_suffix + fastas_suffix + long_dist + medium_dist + short_dist + data_to_annotation + extraexons + + main: + + // Print contents of each channel + gtfs.view { "GTF file: $it" } + fastas.view { "FASTA file: $it" } + gtfs_suffix.view { "GTF suffix: $it" } + fastas_suffix.view { "FASTA suffix: $it" } + data_to_annotation.view { "Data to annotation: $it" } + + extraexons_ch = params.extraexons ? + Channel.fromFilePairs(params.extraexons, checkIfExists: true, size: 1) + .ifEmpty { error "Extra exons not found" } : + Channel.empty() + + + CHECK_INPUT( + evodists_ch, + clusterfile_ch, + gtfs, + fastas, + gtfs_suffix, + fastas_suffix, + long_dist, + medium_dist, + short_dist + ) + + // Sic: https://nextflow-io.github.io/patterns/optional-input/ + if ( extraexons ) { + GENERATE_ANNOTATIONS(data_to_annotation, extraexons_ch) + } else { + GENERATE_ANNOTATIONS(data_to_annotation, file("/path/to/NO_FILE")) + } + + clusters_split_ch = GENERATE_ANNOTATIONS.out.idfolders.toList().map{ [it, it].combinations().findAll{ a, b -> a[0] < b[0]} } + .flatMap() + .map { ["${it[0][0]}-${it[1][0]}".toString(), it[0][1], it[1][1]] } + + + // Copy the gene cluster file to output to use for the exint_plotter and compare_exon_sets modules + SPLIT_CLUSTERS_BY_SPECIES_PAIRS(clusterfile_ch) + + // Split clusters + cls_tab_files_ch = SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.cls_tab_files + SPLIT_CLUSTERS_IN_CHUNKS(cls_tab_files_ch.collect(), clusters_split_ch) + + cls_files_2_align = SPLIT_CLUSTERS_IN_CHUNKS.out.cls_files_2_align + cls_files_2_align_t = cls_files_2_align.transpose().map{[it[0].getFileName().toString()+"-"+it[1].getFileName().toString(), it[0], it[1], it[2]]} + + //Create a channel for the evo distances + sp1_sp2_dist = Channel + .fromPath("${params.evodists}") + .splitText() + .map{"${it}".trim().split("\t")}.map{[it[0]+"-"+it[1], it[2]]} + + sp2_sp1_dist = Channel + .fromPath("${params.evodists}") + .splitText() + .map{"${it}".trim().split("\t")}.map{[it[1]+"-"+it[0], it[2]]} + + species_pairs_dist = sp1_sp2_dist.concat(sp2_sp1_dist) + //Only the species pairs with a common index will be kept + dist_ranges_ch = clusters_split_ch.join(species_pairs_dist).map{[it[0], it[3]]} + alignment_input = cls_files_2_align_t.groupTuple().join(dist_ranges_ch).transpose() + + emit: + clusters_split_ch + dist_ranges_ch + alignment_input +}