Skip to content

Commit

Permalink
migration up to starting alignment
Browse files Browse the repository at this point in the history
  • Loading branch information
toniher committed Oct 11, 2024
1 parent fbe8817 commit e61712b
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 82 deletions.
122 changes: 40 additions & 82 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ LOCAL_MODULES='./modules/local/exorthist'

include { CHECK_INPUT } from "${LOCAL_MODULES}/check_input.nf"
include { GENERATE_ANNOTATIONS } from "${LOCAL_MODULES}/generate_annotations.nf"
include { SPLIT_CLUSTERS_BY_SPECIES_PAIRS } from "${LOCAL_MODULES}/split_clusters_species.nf"
include { SPLIT_CLUSTERS_IN_CHUNKS } from "${LOCAL_MODULES}/split_clusters_chunks.nf"

/*
* Validate input and print log file
Expand Down Expand Up @@ -164,99 +166,55 @@ workflow {
GENERATE_ANNOTATIONS(data_to_annotation, file("/path/to/NO_FILE"))
}

// Review outputs below
clusters_split_ch = GENERATE_ANNOTATIONS.out.idfolders.toList().map{ [it, it].combinations().findAll{ a, b -> a[0] < b[0]} }
.flatMap()
.map { ["${it[0][0]}-${it[1][0]}".toString(), it[0][1], it[1][1]] }

CHECK_INPUT.out.run_info.view()
GENERATE_ANNOTATIONS.out.idfolders.view()

// Copy the gene cluster file to output to use for the exint_plotter and compare_exon_sets modules
SPLIT_CLUSTERS_BY_SPECIES_PAIRS(clusterfile_ch)

// Split clusters
cls_tab_files_ch = SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.cls_tab_files
SPLIT_CLUSTERS_IN_CHUNKS(cls_tab_files_ch.collect(), clusters_split_ch)

cls_files_2_align = SPLIT_CLUSTERS_IN_CHUNKS.out.cls_files_2_align
cls_files_2_align_t = cls_files_2_align.transpose().map{[it[0].getFileName().toString()+"-"+it[1].getFileName().toString(), it[0], it[1], it[2]]}

//Create a channel for the evo distances
sp1_sp2_dist = Channel
.fromPath("${params.evodists}")
.splitText()
.map{"${it}".trim().split("\t")}.map{[it[0]+"-"+it[1], it[2]]}

sp2_sp1_dist = Channel
.fromPath("${params.evodists}")
.splitText()
.map{"${it}".trim().split("\t")}.map{[it[1]+"-"+it[0], it[2]]}

species_pairs_dist = sp1_sp2_dist.concat(sp2_sp1_dist)
//Only the species pairs with a common index will be kept
dist_ranges_ch = clusters_split_ch.join(species_pairs_dist).map{[it[0], it[3]]}
alignment_input = cls_files_2_align_t.groupTuple().join(dist_ranges_ch).transpose()

// Review outputs below
CHECK_INPUT.out.run_info.view()
GENERATE_ANNOTATIONS.out.idfolders.view { "ANN: $it" }
SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.cls_tab_files.view()
SPLIT_CLUSTERS_BY_SPECIES_PAIRS.out.gene_cluster_file.view()
clusters_split_ch.view { "CL: $it" }
dist_ranges_ch.view { "DR: $it" }
alignment_input.view { "AL: $it" }
}

//
//
// /*
// * split cluster file
// */
// //Copy the gene cluster file to output to use for the exint_plotter and compare_exon_sets modules
// process split_clusters_by_species_pairs {
// tag { clusterfile }
// publishDir "${params.output}/", mode: 'copy', pattern: "gene_cluster_file.gz"
//
// input:
// file(clusterfile)
//
// output:
// file "*.cls.tab" into cls_tab_files, cls_tab_file_4_clustering
// file("gene_cluster_file.gz")
//
// script:
// """
// if [ `echo ${clusterfile} | grep ".gz"` ]; then
// zcat ${clusterfile} > gene_cluster_file
// A2_split_clusters_by_species_pairs.pl -f gene_cluster_file
// gzip gene_cluster_file
// #rm cluster_file
// else
// cat ${clusterfile} > gene_cluster_file
// #A2_split_clusters_by_species_pairs.pl -f ${clusterfile}
// A2_split_clusters_by_species_pairs.pl -f gene_cluster_file
// gzip gene_cluster_file
// fi
// """
// }
//
// idfolders
// .toList().map{ [it, it] .combinations().findAll{ a, b -> a[0] < b[0]} }
// .flatMap()
// .map { ["${it[0][0]}-${it[1][0]}".toString(), it[0][1], it[1][1]] }
// .into{cluster_2_split; anno_2_score_ex_int; species_to_recluster_genes; pairs_4_evodists; pairs_4_EXs_to_realign}
//
//
//
// /*
// * split clusters
// */
//
// process split_clusters_in_chunks {
// tag { id_comb }
//
// input:
// file(cls_tab_files).collect()
// set id_comb, file(idfolder_A), file(idfolder_B) from cluster_2_split
//
// output:
// set file(idfolder_A), file(idfolder_B), file("${idfolder_A}_${idfolder_B}/*.cls.tab-part_*") into cls_files_2_align
//
// script:
// """
// A3_split_clusters_in_chunks.pl --sp1 ${idfolder_A} --sp2 ${idfolder_B} --expath ./ --project_dir ./ --N_split ${params.alignmentnum} --gene_cluster ${id_comb}.cls.tab
// """
// }
//
// cls_files_2_align.transpose().map{[it[0].getFileName().toString()+"-"+it[1].getFileName().toString(), it[0], it[1], it[2]]}.set{cls_files_2_align_t}
//
// //Create a channel for the evo distances
// Channel
// .fromPath("${params.evodists}")
// .splitText()
// .map{"${it}".trim().split("\t")}.map{[it[0]+"-"+it[1], it[2]]}.set{sp1_sp2_dist}
//
// Channel
// .fromPath("${params.evodists}")
// .splitText()
// .map{"${it}".trim().split("\t")}.map{[it[1]+"-"+it[0], it[2]]}.set{sp2_sp1_dist}
//
// sp1_sp2_dist.concat(sp2_sp1_dist).set{species_pairs_dist}
// //Only the species pairs with a common index will be kept
// pairs_4_evodists.join(species_pairs_dist).map{[it[0], it[3]]}.into{dist_ranges_ch; dist_ranges_ch1; dist_ranges_ch2}
//
//
//
// /*
// * Align pairs
// */
// //the last argument is the protein similarity alignment.
// //if a prevaln folder is provided, the protein alignments present in each species pair subfolder will not be repeated.
//
// cls_files_2_align_t.groupTuple().join(dist_ranges_ch1).transpose().set{alignment_input}
//
// process parse_IPA_prot_aln {
// tag { "${cls_part_file}" }
Expand Down
21 changes: 21 additions & 0 deletions modules/local/exorthist/split_clusters_chunks.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*
 * Run A3_split_clusters_in_chunks.pl for one pair of annotation folders.
 *
 * Inputs:
 *   cls_tab_files - files staged into the task directory; the script is
 *                   pointed at the one named "<id_comb>.cls.tab"
 *   id_comb       - identifier of the pair, used to build the cluster file name
 *   idfolder_A    - first folder of the pair  (passed as --sp1)
 *   idfolder_B    - second folder of the pair (passed as --sp2)
 *
 * Output (cls_files_2_align):
 *   both folders plus every file matching "*.cls.tab-part_*" found under
 *   "<idfolder_A>_<idfolder_B>/"
 */
process SPLIT_CLUSTERS_IN_CHUNKS {
    tag { "${idfolder_A}_${idfolder_B}" }

    input:
    path(cls_tab_files)
    tuple val(id_comb), path(idfolder_A), path(idfolder_B)

    output:
    tuple path(idfolder_A), path(idfolder_B), path("${idfolder_A}_${idfolder_B}/*.cls.tab-part_*"), emit: cls_files_2_align

    script:
    // Assemble the command-line options once, then hand them to the Perl script.
    def splitter_opts = [
        "--sp1 ${idfolder_A}",
        "--sp2 ${idfolder_B}",
        "--expath ./",
        "--project_dir ./",
        "--N_split ${params.alignmentnum}",
        "--gene_cluster ${id_comb}.cls.tab"
    ].join(' ')
    """
    A3_split_clusters_in_chunks.pl ${splitter_opts}
    """
}
23 changes: 23 additions & 0 deletions modules/local/exorthist/split_clusters_species.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
 * Normalise the gene cluster file and run A2_split_clusters_by_species_pairs.pl on it.
 *
 * Input:
 *   clusterfile - cluster file, plain text or gzip-compressed
 *                 (compression is detected from a ".gz" file-name suffix)
 *
 * Outputs:
 *   cls_tab_files     - every "*.cls.tab" file present after the Perl script ran
 *   gene_cluster_file - "gene_cluster_file.gz"; also copied to params.output
 */
process SPLIT_CLUSTERS_BY_SPECIES_PAIRS {
    tag { clusterfile.name }
    publishDir "${params.output}/", mode: 'copy', pattern: "gene_cluster_file.gz"

    input:
    // Stage the input in its own sub-directory: the script writes the fixed
    // names "gene_cluster_file" / "gene_cluster_file.gz" into the task
    // directory, so an input carrying one of those names would otherwise be
    // truncated through its symlink (cat X > X) or make gzip refuse to overwrite.
    path clusterfile, stageAs: 'input/*'

    output:
    path "*.cls.tab", emit: cls_tab_files
    path "gene_cluster_file.gz", emit: gene_cluster_file

    script:
    // "gzip -dc" is used instead of "zcat" because BSD/macOS zcat only accepts
    // ".Z" files; the path is quoted so unusual file names are not word-split.
    """
    if [[ "${clusterfile}" == *.gz ]]; then
        gzip -dc "${clusterfile}" > gene_cluster_file
    else
        cat "${clusterfile}" > gene_cluster_file
    fi
    A2_split_clusters_by_species_pairs.pl -f gene_cluster_file
    gzip gene_cluster_file
    """
}

0 comments on commit e61712b

Please sign in to comment.