diff --git a/.gitignore b/.gitignore index 5124c9ac..cc5e3714 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,5 @@ results/ testing/ testing* *.pyc +*.swp +runner/ diff --git a/.prettierignore b/.prettierignore index 437d763d..31c34ca9 100644 --- a/.prettierignore +++ b/.prettierignore @@ -10,3 +10,4 @@ testing/ testing* *.pyc bin/ +runner/ diff --git a/bin/ChromHMM.jar b/bin/ChromHMM.jar new file mode 100755 index 00000000..b0a6634d Binary files /dev/null and b/bin/ChromHMM.jar differ diff --git a/bin/combine_tables.py b/bin/combine_tables.py index 4ed8f8eb..a29c9876 100755 --- a/bin/combine_tables.py +++ b/bin/combine_tables.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import argparse import numpy as np diff --git a/bin/get_chromhmm_results.py b/bin/get_chromhmm_results.py new file mode 100755 index 00000000..d2793c7f --- /dev/null +++ b/bin/get_chromhmm_results.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +import argparse +import pandas as pd +import numpy as np + + +parser = argparse.ArgumentParser(description="Process ChromHMM output into bed file of predicted enhancers") + +parser.add_argument("-e", "--emissions", type=str, required=True, help="Path to emission file") +parser.add_argument("-b", "--bed", type=str, required=True, help="Path to bed file") +parser.add_argument("-t", "--threshold", type=float, required=False, default=0.9, help="Threshold for state emissions") +parser.add_argument("-m", "--markers", nargs='+', required=False, default=["H3K27ac", "H3K4me3"], help="ChIP-Seq markers that indicate an enhancer") +parser.add_argument("-o", "--output", type=str, required=True, help="Path to output bed with enhancer positions") + +args = parser.parse_args() + +path_emissions = args.emissions +path_bed = args.bed +threshold = args.threshold +markers = args.markers +output = args.output + + +# Read emissions file for the provided markers +emissions = pd.read_csv(path_emissions, sep = "\t")[["State (Emission order)"] + markers].rename(columns={"State (Emission order)": "State"}) + + +# Read input bed file and remove unecessary columns +bed = pd.read_csv(path_bed, + sep="\t", + skiprows=1, + names=["chr", "start", "end", "state", "score", "strand", "start_1", "end_1", "rgb"] + ).drop(columns=["strand", "score", "start_1", "end_1", "rgb"]) + + +# Keep state if any of the markers is enriched > threshold for this state +states = emissions[np.any([emissions[marker] >= threshold for marker in markers], axis=0)]["State"].tolist() + + +# Subset bed file for selected states +out_bed = bed[np.isin(bed["state"], states)].drop(columns=["state"]) + +# Write output +out_bed.to_csv(output, index=False, sep="\t", header=False) + diff --git a/bin/make_cellmarkfiletable.py b/bin/make_cellmarkfiletable.py new file mode 100755 index 00000000..43db0e0d --- /dev/null +++ b/bin/make_cellmarkfiletable.py @@ -0,0 +1,23 @@ +#!/usr/bin/env python3 +# coding: utf-8 + +import os +import argparse +import pandas as pd + + +# Creates a cellmarkfiletable which is needed as input for ChromHMM +parser = argparse.ArgumentParser(description = "Script to remove full paths of input file to fit into nextflow workflow") +parser.add_argument("--input", help = "Input directory", required = True, type = str) +parser.add_argument("--output", help = "path for output file", required = True, type = str) + +args = parser.parse_args() + +input = args.input +output = args.output + +table = pd.read_csv(input, sep = "\t", names=["state", "assay", "bam", "control"]) + +table["bam"] = [os.path.basename(path) for path in table["bam"]] +table["control"] = [os.path.basename(path) for path in table["control"]] +table.to_csv(output, header=False, sep="\t", index=False) diff --git a/bin/reformat_bam.sh b/bin/reformat_bam.sh new file mode 100755 index 00000000..020c7651 --- /dev/null +++ b/bin/reformat_bam.sh @@ -0,0 +1,21 @@ +#!/bin/bash + +# Reheaders a bam file and adds 'chr' to each chromosome +# $1 is the input bam +# $2 is the output bam + +samtools view -H $1 | \ + sed -e 's/SN:1/SN:chr1/' | sed -e 's/SN:2/SN:chr2/' | \ + sed -e 's/SN:3/SN:chr3/' | sed -e 's/SN:4/SN:chr4/' | \ + sed -e 's/SN:5/SN:chr5/' | sed -e 's/SN:6/SN:chr6/' | \ + sed -e 's/SN:7/SN:chr7/' | sed -e 's/SN:8/SN:chr8/' | \ + sed -e 's/SN:9/SN:chr9/' | sed -e 's/SN:10/SN:chr10/' | \ + sed -e 's/SN:11/SN:chr11/' | sed -e 's/SN:12/SN:chr12/' | \ + sed -e 's/SN:13/SN:chr13/' | sed -e 's/SN:14/SN:chr14/' | \ + sed -e 's/SN:15/SN:chr15/' | sed -e 's/SN:16/SN:chr16/' | \ + sed -e 's/SN:17/SN:chr17/' | sed -e 's/SN:18/SN:chr18/' | \ + sed -e 's/SN:19/SN:chr19/' | sed -e 's/SN:20/SN:chr20/' | \ + sed -e 's/SN:21/SN:chr21/' | sed -e 's/SN:22/SN:chr22/' | \ + sed -e 's/SN:X/SN:chrX/' | sed -e 's/SN:Y/SN:chrY/' | \ + sed -e 's/SN:MT/SN:chrM/' | samtools reheader - $1 > $2 + diff --git a/bin/rose.py b/bin/rose.py new file mode 100755 index 00000000..9740ece4 --- /dev/null +++ b/bin/rose.py @@ -0,0 +1,666 @@ +#!/usr/bin/env python3 + +import os +from optparse import OptionParser + + +def region_stitching(input_gff, stitch_window, tss_window, annot_file, remove_tss=True): + print('Performing region stitching...') + # first have to turn bound region file into a locus collection + + # need to make sure this names correctly... each region should have a unique name + bound_collection = gff_to_locus_collection(input_gff) + + # filter out all bound regions that overlap the TSS of an ACTIVE GENE + if remove_tss: + # first make a locus collection of TSS + start_dict = make_start_dict(annot_file) + + # now makeTSS loci for active genes + remove_ticker = 0 + # this loop makes a locus centered around +/- tss_window of transcribed genes + # then adds it to the list tss_loci + tss_loci = [] + for gene_id in list(start_dict.keys()): + tss_loci.append(make_tss_locus(gene_id, start_dict, tss_window, tss_window)) + + # this turns the tss_loci list into a LocusCollection + # 50 is the internal parameter for LocusCollection and doesn't really matter + tss_collection = LocusCollection(tss_loci, 50) + + # gives all the loci in bound_collection + bound_loci = bound_collection.get_loci() + + # this loop will check if each bound region is contained by the TSS exclusion zone + # this will drop out a lot of the promoter only regions that are tiny + # typical exclusion window is around 2kb + for locus in bound_loci: + if len(tss_collection.get_containers(locus, 'both')) > 0: + # if true, the bound locus overlaps an active gene + bound_collection.remove(locus) + remove_ticker += 1 + print(f'Removed {remove_ticker} loci because they were contained by a TSS') + + # bound_collection is now all enriched region loci that don't overlap an active TSS + stitched_collection = bound_collection.stitch_collection(stitch_window, 'both') + + if remove_tss: + # now replace any stitched region that overlap 2 distinct genes + # with the original loci that were there + fixed_loci = [] + tss_loci = [] + for gene_id in list(start_dict.keys()): + tss_loci.append(make_tss_locus(gene_id, start_dict, 50, 50)) + + # this turns the tss_loci list into a LocusCollection + # 50 is the internal parameter for LocusCollection and doesn't really matter + tss_collection = LocusCollection(tss_loci, 50) + remove_ticker = 0 + original_ticker = 0 + for stitched_locus in stitched_collection.get_loci(): + overlapping_tss_loci = tss_collection.get_overlap(stitched_locus, 'both') + tss_names = [start_dict[tssLocus.id()]['name'] for tssLocus in overlapping_tss_loci] + tss_names = uniquify(tss_names) + if len(tss_names) > 2: + original_loci = bound_collection.get_overlap(stitched_locus, 'both') + original_ticker += len(original_loci) + fixed_loci += original_loci + remove_ticker += 1 + else: + fixed_loci.append(stitched_locus) + + print(f'Removed {remove_ticker} stitched loci because they overlapped multiple TSSs') + print(f'Added back {original_ticker} original loci') + fixed_collection = LocusCollection(fixed_loci, 50) + return fixed_collection + else: + return stitched_collection + + +# ================================================================== +# ==========================I/O FUNCTIONS=========================== +# ================================================================== + +# unparse_table 4/14/08 +# takes in a table generated by parse_table and writes it to an output file +# takes as parameters (table, output, sep), where sep is how the file is delimited +# example call unparse_table(table, 'table.txt', '\t') for a tab del file + +def unparse_table(table, output, sep): + fh_out = open(output, 'w') + if len(sep) == 0: + for i in table: + fh_out.write(str(i)) + fh_out.write('\n') + else: + for line in table: + line = [str(x) for x in line] + line = sep.join(line) + + fh_out.write(line) + fh_out.write('\n') + + fh_out.close() + + +# parse_table 4/14/08 +# takes in a table where columns are separated by a given symbol and outputs +# a nested list such that list[row][col] +# example call: +# table = parse_table('file.txt','\t') +def parse_table(fn, sep, header=False, excel=False): + fh = open(fn) + lines = fh.readlines() + fh.close() + if excel: + lines = lines[0].split('\r') + if lines[0].count('\r') > 0: + lines = lines[0].split('\r') + table = [] + if header: + lines = lines[1:] + for i in lines: + table.append(i[:-1].split(sep)) + + return table + + +def format_folder(folder_name, create=False): + """ + makes sure a folder exists and if not makes it + returns a bool for folder + """ + + if folder_name[-1] != '/': + folder_name += '/' + + try: + foo = os.listdir(folder_name) + return folder_name + except OSError: + print(f'folder {folder_name} does not exist') + if create: + os.system(f'mkdir {folder_name}') + return folder_name + else: + + return False + + # ================================================================== + + +# ===================ANNOTATION FUNCTIONS=========================== +# ================================================================== + + +def make_start_dict(annot_file, gene_list=[]): + """ + makes a dictionary keyed by refseq ID that contains information about + chrom/start/stop/strand/common name + """ + + if type(gene_list) == str: + gene_list = parse_table(gene_list, '\t') + gene_list = [line[0] for line in gene_list] + + if annot_file.upper().count('REFSEQ') == 1: + refseq_table, refseq_dict = import_refseq(annot_file) + if len(gene_list) == 0: + gene_list = list(refseq_dict.keys()) + start_dict = {} + for gene in gene_list: + if gene not in refseq_dict: + continue + start_dict[gene] = {} + start_dict[gene]['sense'] = refseq_table[refseq_dict[gene][0]][3] + start_dict[gene]['chr'] = refseq_table[refseq_dict[gene][0]][2] + start_dict[gene]['start'] = get_tsss([gene], refseq_table, refseq_dict) + if start_dict[gene]['sense'] == '+': + start_dict[gene]['end'] = [int(refseq_table[refseq_dict[gene][0]][5])] + else: + start_dict[gene]['end'] = [int(refseq_table[refseq_dict[gene][0]][4])] + start_dict[gene]['name'] = refseq_table[refseq_dict[gene][0]][12] + return start_dict + + +# generic function to get the TSS of any gene +def get_tsss(gene_list, refseq_table, refseq_dict): + if len(gene_list) == 0: + refseq = refseq_table + else: + refseq = refseq_from_key(gene_list, refseq_dict, refseq_table) + tss = [] + for line in refseq: + if line[3] == '+': + tss.append(line[4]) + if line[3] == '-': + tss.append(line[5]) + tss = list(map(int, tss)) + + return tss + + +# 12/29/08 +# refseq_from_key(refseqKeyList,refseq_dict,refseq_table) +# function that grabs refseq lines from refseq IDs +def refseq_from_key(refseq_key_list, refseq_dict, refseq_table): + type_refseq = [] + for name in refseq_key_list: + if name in refseq_dict: + type_refseq.append(refseq_table[refseq_dict[name][0]]) + return type_refseq + + +# 10/13/08 +# import_refseq +# takes in a refseq table and makes a refseq table and a refseq dictionary for keying the table + +def import_refseq(refseq_file, return_multiples=False): + """ + opens up a refseq file downloaded by UCSC + """ + refseq_table = parse_table(refseq_file, '\t') + refseq_dict = {} + ticker = 1 + for line in refseq_table[1:]: + if line[1] in refseq_dict: + refseq_dict[line[1]].append(ticker) + else: + refseq_dict[line[1]] = [ticker] + ticker = ticker + 1 + + multiples = [] + for i in refseq_dict: + if len(refseq_dict[i]) > 1: + multiples.append(i) + + if return_multiples: + return refseq_table, refseq_dict, multiples + else: + return refseq_table, refseq_dict + + +# ================================================================== +# ========================LOCUS INSTANCE============================ +# ================================================================== + +# Locus and LocusCollection instances courtesy of Graham Ruby + + +class Locus: + # this may save some space by reducing the number of chromosome strings + # that are associated with Locus instances (see __init__). + __chrDict = dict() + __senseDict = {'+': '+', '-': '-', '.': '.'} + + # chr = chromosome name (string) + # sense = '+' or '-' (or '.' for an ambidextrous locus) + # start,end = ints of the start and end coords of the locus + # end coord is the coord of the last nucleotide. + def __init__(self, chr, start, end, sense, id=''): + coords = [int(start), int(end)] + coords.sort() + # this method for assigning chromosome should help avoid storage of + # redundant strings. + if chr not in self.__chrDict: + self.__chrDict[chr] = chr + self._chr = self.__chrDict[chr] + self._sense = self.__senseDict[sense] + self._start = int(coords[0]) + self._end = int(coords[1]) + self._id = id + + def id(self): + return self._id + + def chr(self): + return self._chr + + def start(self): + return self._start # returns the smallest coordinate + + def end(self): + return self._end # returns the biggest coordinate + + def len(self): + return self._end - self._start + 1 + + def get_antisense_locus(self): + if self._sense == '.': + return self + else: + switch = {'+': '-', '-': '+'} + return Locus(self._chr, self._start, self._end, switch[self._sense]) + + def coords(self): + return [self._start, self._end] # returns a sorted list of the coordinates + + def sense(self): + return self._sense + + # returns boolean; True if two loci share any coordinates in common + def overlaps(self, other_locus): + if self.chr() != other_locus.chr(): + return False + elif not (self._sense == '.' or other_locus.sense() == '.' or self.sense() == other_locus.sense()): + return False + elif self.start() > other_locus.end() or other_locus.start() > self.end(): + return False + else: + return True + + # returns boolean; True if all the nucleotides of the given locus overlap + # with the self locus + def contains(self, other_locus): + if self.chr() != other_locus.chr(): + return False + elif not (self._sense == '.' or other_locus.sense() == '.' or self.sense() == other_locus.sense()): + return False + elif self.start() > other_locus.start() or other_locus.end() > self.end(): + return False + else: + return True + + # same as overlaps, but considers the opposite strand + def overlaps_antisense(self, other_locus): + return self.get_antisense_locus().overlaps(other_locus) + + # same as contains, but considers the opposite strand + def contains_antisense(self, other_locus): + return self.get_antisense_locus().contains(other_locus) + + def __hash__(self): + return self._start + self._end + + def __eq__(self, other): + if self.__class__ != other.__class__: + return False + if self.chr() != other.chr(): + return False + if self.start() != other.start(): + return False + if self.end() != other.end(): + return False + if self.sense() != other.sense(): + return False + return True + + def __ne__(self, other): + return not (self.__eq__(other)) + + def __str__(self): + return self.chr() + '(' + self.sense() + '):' + '-'.join(map(str, self.coords())) + + def check_rep(self): + pass + + +class LocusCollection: + def __init__(self, loci, window_size): + self.__chr_to_coord_to_loci = dict() + self.__loci = dict() + self.__win_size = window_size + for lcs in loci: + self.__add_locus(lcs) + + def __add_locus(self, lcs): + if lcs not in self.__loci: + self.__loci[lcs] = None + if lcs.sense() == '.': + chr_key_list = [lcs.chr() + '+', lcs.chr() + '-'] + else: + chr_key_list = [lcs.chr() + lcs.sense()] + for chr_key in chr_key_list: + if chr_key not in self.__chr_to_coord_to_loci: + self.__chr_to_coord_to_loci[chr_key] = dict() + for n in self.__get_key_range(lcs): + if n not in self.__chr_to_coord_to_loci[chr_key]: + self.__chr_to_coord_to_loci[chr_key][n] = [] + self.__chr_to_coord_to_loci[chr_key][n].append(lcs) + + def __get_key_range(self, locus): + start = locus.start() // self.__win_size + # add 1 because of the range + end = locus.end() // self.__win_size + 1 + return range(start, end) + + def __len__(self): + return len(self.__loci) + + def append(self, new): + self.__add_locus(new) + + def extend(self, new_list): + for lcs in new_list: + self.__add_locus(lcs) + + def has_locus(self, locus): + return locus in self.__loci + + def remove(self, old): + if old not in self.__loci: + raise ValueError("requested locus isn't in collection") + del self.__loci[old] + if old.sense() == '.': + sense_list = ['+', '-'] + else: + sense_list = [old.sense()] + for k in self.__get_key_range(old): + for sense in sense_list: + self.__chr_to_coord_to_loci[old.chr() + sense][k].remove(old) + + def get_window_size(self): + return self.__win_size + + def get_loci(self): + return list(self.__loci.keys()) + + def get_chr_list(self): + # i need to remove the strand info from the chromosome keys and make + # them non-redundant. + temp_keys = dict() + for k in list(self.__chr_to_coord_to_loci.keys()): + temp_keys[k[:-1]] = None + return list(temp_keys.keys()) + + def __subset_helper(self, locus, sense): + sense = sense.lower() + if ['sense', 'antisense', 'both'].count(sense) != 1: + raise ValueError("sense command invalid: '" + sense + "'.") + matches = dict() + senses = ['+', '-'] + if locus.sense() == '.' or sense == 'both': + lamb = lambda s: True + elif sense == 'sense': + lamb = lambda s: s == locus.sense() + elif sense == 'antisense': + lamb = lambda s: s != locus.sense() + else: + raise ValueError("sense value was inappropriate: '" + sense + "'.") + for s in filter(lamb, senses): + chr_key = locus.chr() + s + if chr_key in self.__chr_to_coord_to_loci: + for n in self.__get_key_range(locus): + if n in self.__chr_to_coord_to_loci[chr_key]: + for lcs in self.__chr_to_coord_to_loci[chr_key][n]: + matches[lcs] = None + return list(matches.keys()) + + # sense can be 'sense' (default), 'antisense', or 'both' + # returns all members of the collection that overlap the locus + def get_overlap(self, locus, sense='sense'): + matches = self.__subset_helper(locus, sense) + # now, get rid of the ones that don't really overlap + real_matches = dict() + if sense == 'sense' or sense == 'both': + for i in [lcs for lcs in matches if lcs.overlaps(locus)]: + real_matches[i] = None + if sense == 'antisense' or sense == 'both': + for i in [lcs for lcs in matches if lcs.overlaps_antisense(locus)]: + real_matches[i] = None + return list(real_matches.keys()) + + # sense can be 'sense' (default), 'antisense', or 'both' + # returns all members of the collection that are contained by the locus + def get_contained(self, locus, sense='sense'): + matches = self.__subset_helper(locus, sense) + # now, get rid of the ones that don't really overlap + real_matches = dict() + if sense == 'sense' or sense == 'both': + for i in [lcs for lcs in matches if locus.contains(lcs)]: + real_matches[i] = None + if sense == 'antisense' or sense == 'both': + for i in [lcs for lcs in matches if locus.contains_antisense(lcs)]: + real_matches[i] = None + return list(real_matches.keys()) + + # sense can be 'sense' (default), 'antisense', or 'both' + # returns all members of the collection that contain the locus + def get_containers(self, locus, sense='sense'): + matches = self.__subset_helper(locus, sense) + # now, get rid of the ones that don't really overlap + real_matches = dict() + if sense == 'sense' or sense == 'both': + for i in [lcs for lcs in matches if lcs.contains(locus)]: + real_matches[i] = None + if sense == 'antisense' or sense == 'both': + for i in [lcs for lcs in matches if lcs.contains_antisense(locus)]: + real_matches[i] = None + return list(real_matches.keys()) + + def stitch_collection(self, stitch_window=1, sense='both'): + + """ + reduces the collection by stitching together overlapping loci + returns a new collection + """ + + # initializing stitch_window to 1 + # this helps collect directly adjacent loci + + locus_list = self.get_loci() + old_collection = LocusCollection(locus_list, 500) + + stitched_collection = LocusCollection([], 500) + + for locus in locus_list: + if old_collection.has_locus(locus): + old_collection.remove(locus) + overlapping_loci = old_collection.get_overlap( + Locus(locus.chr(), locus.start() - stitch_window, locus.end() + stitch_window, locus.sense(), + locus.id()), sense) + + stitch_ticker = 1 + while len(overlapping_loci) > 0: + stitch_ticker += len(overlapping_loci) + overlap_coords = locus.coords() + + for overlapping_locus in overlapping_loci: + overlap_coords += overlapping_locus.coords() + old_collection.remove(overlapping_locus) + if sense == 'both': + locus = Locus(locus.chr(), min(overlap_coords), max(overlap_coords), '.', locus.id()) + else: + locus = Locus(locus.chr(), min(overlap_coords), max(overlap_coords), locus.sense(), locus.id()) + overlapping_loci = old_collection.get_overlap( + Locus(locus.chr(), locus.start() - stitch_window, locus.end() + stitch_window, locus.sense()), + sense) + locus._id = f'{stitch_ticker}_{locus.id()}_lociStitched' + + stitched_collection.append(locus) + + else: + continue + return stitched_collection + + +# ================================================================== +# ========================LOCUS FUNCTIONS=========================== +# ================================================================== +# 06/11/09 +# turns a locusCollection into a gff +# does not write to disk though +def locus_collection_to_gff(locus_collection): + loci_list = locus_collection.get_loci() + gff = [] + for locus in loci_list: + new_line = [locus.chr(), locus.id(), '', locus.coords()[0], locus.coords()[1], '', locus.sense(), '', + locus.id()] + gff.append(new_line) + return gff + + +def gff_to_locus_collection(gff, window=500): + """ + opens up a gff file and turns it into a LocusCollection instance + """ + + loci_list = [] + if type(gff) == str: + gff = parse_table(gff, '\t') + + for line in gff: + # USE line[2] as the locus id. If that is empty use line[8] + if len(line[2]) > 0: + name = line[2] + elif len(line[8]) > 0: + name = line[8] + else: + name = f'{line[0]}:{line[6]}:{line[3]}-{line[4]}' + + loci_list.append(Locus(line[0], line[3], line[4], line[6], name)) + return LocusCollection(loci_list, window) + + +def make_tss_locus(gene, start_dict, upstream, downstream): + """ + given a start_dict, make a locus for any gene's TSS w/ upstream and downstream windows + """ + + start = start_dict[gene]['start'][0] + if start_dict[gene]['sense'] == '-': + return Locus(start_dict[gene]['chr'], start - downstream, start + upstream, '-', gene) + else: + return Locus(start_dict[gene]['chr'], start - upstream, start + downstream, '+', gene) + + +# ================================================================== +# ========================MISC FUNCTIONS============================ +# ================================================================== + + +# uniquify function +# by Peter Bengtsson +# Used under a creative commons license +# sourced from here: http://www.peterbe.com/plog/uniqifiers-benchmark + +def uniquify(seq, idfun=None): + # order preserving + if idfun is None: + def idfun(x): return x + seen = {} + result = [] + for item in seq: + marker = idfun(item) + # in old Python versions: + # if seen.has_key(marker) + # but in new ones: + if marker in seen: continue + seen[marker] = 1 + result.append(item) + return result + + + + +def main(): + parser = OptionParser(usage="usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -o [OUTPUT_FOLDER]") + + parser.add_option("-i", "--i", dest="input", nargs=1, default=None, + help="Enter a .gff or .bed file of binding sites used to make enhancers") + parser.add_option("-o", "--out", dest="out", nargs=1, default=None, + help="Enter an output folder") + parser.add_option("-g", "--genome", dest="genome", default=None, + help="Enter the genome build (MM9,MM8,HG18,HG19,HG38)") + parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default=12500, + help="Enter a max linking distance for stitching") + parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0, + help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") + + options, args = parser.parse_args() + + input_gff_file = options.input + + stitch_window = int(options.stitch) + + tss_window = int(options.tss) + if tss_window != 0: + remove_tss = True + else: + remove_tss = False + + # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS + print(f'Using {input_gff_file} as the input gff') + input_name = input_gff_file.split('/')[-1].split('.')[0] + + # Get annotation file + annot_file = options.genome + print(f'Using {annot_file} as the genome') + + print('Making start dict') + start_dict = make_start_dict(annot_file) + + print('Stitching regions together') + stitched_collection = region_stitching(input_gff_file, stitch_window, tss_window, annot_file, remove_tss) + + print('Making GFF from stitched collection') + stitched_gff = locus_collection_to_gff(stitched_collection) + + stitched_gff_file = options.out + + print(f'Writing stitched GFF to disk as {stitched_gff_file}') + unparse_table(stitched_gff, stitched_gff_file, '\t') + + +if __name__ == '__main__': + main() diff --git a/conf/igenomes.config b/conf/igenomes.config index 4e6541c9..2f225682 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -32,6 +32,8 @@ params { "150" : 2824648687, "200" : 2848794782 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/hg19.txt" + ucsc_file = "https://raw.githubusercontent.com/stjude/ROSE/master/annotation/hg19_refseq.ucsc" } 'GRCh38' { fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" @@ -53,6 +55,8 @@ params { "150" : 2862089864, "200" : 2892537351 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/hg38.txt" + ucsc_file = "https://raw.githubusercontent.com/stjude/ROSE/master/annotation/hg38_refseq.ucsc" } 'GRCm38' { fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" @@ -75,6 +79,8 @@ params { "150" : 2492306232, "200" : 2519386924 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/mm10.txt" + ucsc_file = "https://raw.githubusercontent.com/stjude/ROSE/master/annotation/mm10_refseq.ucsc" } 'TAIR10' { fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" @@ -180,6 +186,7 @@ params { "150" : 1285330773, "200" : 1292538906 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/danRer10.txt" } 'BDGP6' { fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" @@ -197,6 +204,7 @@ params { "150" : 126903604, "200" : 128575605 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/dm6.txt" } 'EquCab2' { fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" @@ -336,6 +344,7 @@ params { "150" : 2405692811, "200" : 2407324495 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/rn5.txt" } 'Rnor_6.0' { fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" @@ -354,6 +363,7 @@ params { "150" : 2477334634, "200" : 2478552171 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/rn6.txt" } 'R64-1-1' { fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" @@ -462,6 +472,8 @@ params { "150" : 2862089864, "200" : 2892537351 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/hg38.txt" + ucsc_file = "https://raw.githubusercontent.com/stjude/ROSE/master/annotation/hg38_refseq.ucsc" } 'hg19' { fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" @@ -484,6 +496,8 @@ params { "150" : 2824648687, "200" : 2848794782 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/hg19.txt" + ucsc_file = "https://raw.githubusercontent.com/stjude/ROSE/master/annotation/hg19_refseq.ucsc" } 'mm10' { fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" @@ -506,6 +520,8 @@ params { "150" : 2492306232, "200" : 2519386924 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/mm10.txt" + ucsc_file = "https://raw.githubusercontent.com/stjude/ROSE/master/annotation/mm10_refseq.ucsc" } 'bosTau8' { fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" @@ -541,6 +557,7 @@ params { "150" : 98879728, "200" : 98769409 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/ce10.txt" } 'canFam3' { fasta = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" @@ -576,6 +593,7 @@ params { "150" : 1285330773, "200" : 1292538906 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/danRer10.txt" } 'dm6' { fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" @@ -594,6 +612,7 @@ params { "150" : 126908682, "200" : 128599061 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/dm6.txt" } 'equCab2' { fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" @@ -666,6 +685,7 @@ params { "150" : 2477334634, "200" : 2478552171 ] + chromsizes = "https://raw.githubusercontent.com/jernst98/ChromHMM/master/CHROMSIZES/rn6.txt" } 'sacCer3' { fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" diff --git a/conf/modules.config b/conf/modules.config index 0e2d076f..f70e6c58 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -52,6 +52,28 @@ process { ext.suffix = "bed" } + withName: BED_TO_GFF { + ext.args = {"'BEGIN{FS=\"\\t\";OFS=\"\\t\"} { print \$1, \"bed2gff\", \"region\", \$2+1, \$3, \".\", \".\", \".\", \".\"}'"} + ext.prefix = {"$meta.id"} + ext.suffix = "gff" + } + + withName: REFORMAT_GFF { + ext.args = {"'BEGIN{FS=\"\\t\";OFS=\"\\t\"} {if(!match(\$1, /^chr/)) \$1=\"chr\"\$1; \$2=\"seq_\"NR; print \$1, \$2, \"\", \$4, \$5, \"\", \$7, \"\", \$2}'"} + ext.prefix = {"${meta.id}_reformatted"} + ext.suffix = "gff" + } + + withName: ROSE_OUTPUT_TO_BED { + ext.args = {"'BEGIN{FS=\"\\t\";OFS=\"\\t\"} {print \$1, \$4-1, \$5}'"} + ext.prefix = {"$meta.id"} + ext.suffix = "bed" + } + + withName: SUBTRACT_BLACKLIST { + ext.prefix = {"${meta.id}_subtracted"} + } + withName: ".*_FAIDX" { ext.suffix = "fai" } diff --git a/docker/chromhmm/Dockerfile b/docker/chromhmm/Dockerfile new file mode 100644 index 00000000..0dcd007a --- /dev/null +++ b/docker/chromhmm/Dockerfile @@ -0,0 +1,6 @@ +FROM ubuntu:latest + +RUN apt-get update && \ + apt-get install -y openjdk-17-jdk && \ + apt-get clean; + diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 6450aa31..8de182fb 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -15,3 +15,7 @@ services: build: context: ehmm image: bigdatainbiomedicine/inspect-ehmm + chromhmm: + build: + context: chromhmm + image: leonhafner/openjdk:17 diff --git a/main.nf b/main.nf index 70143034..550ac57a 100644 --- a/main.nf +++ b/main.nf @@ -17,12 +17,14 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') -params.gtf = WorkflowMain.getGenomeAttribute(params, 'gtf') -params.blacklist = WorkflowMain.getGenomeAttribute(params, 'blacklist') -params.pwm = WorkflowMain.getGenomeAttribute(params, 'pwm') -params.tax_id = WorkflowMain.getGenomeAttribute(params, 'tax_id') -params.promoters = WorkflowMain.getGenomeAttribute(params, 'promoters') +params.fasta = WorkflowMain.getGenomeAttribute(params, 'fasta') +params.gtf = WorkflowMain.getGenomeAttribute(params, 'gtf') +params.blacklist = WorkflowMain.getGenomeAttribute(params, 'blacklist') +params.pwm = WorkflowMain.getGenomeAttribute(params, 'pwm') +params.tax_id = WorkflowMain.getGenomeAttribute(params, 'tax_id') +params.promoters = WorkflowMain.getGenomeAttribute(params, 'promoters') +params.chromsizes = WorkflowMain.getGenomeAttribute(params, 'chromsizes') +params.ucsc_file = WorkflowMain.getGenomeAttribute(params, 'ucsc_file') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules/local/chip_atlas/filter.nf b/modules/local/chip_atlas/filter.nf index 9b65fb59..da3f7aa3 100644 --- a/modules/local/chip_atlas/filter.nf +++ b/modules/local/chip_atlas/filter.nf @@ -17,7 +17,7 @@ process FILTER { script: """ - #!/usr/bin/env python + #!/usr/bin/env python3 import pandas as pd import argparse diff --git a/modules/local/chromhmm/binarize_bams.nf b/modules/local/chromhmm/binarize_bams.nf new file mode 100644 index 00000000..4fab6ab1 --- /dev/null +++ b/modules/local/chromhmm/binarize_bams.nf @@ -0,0 +1,30 @@ +process BINARIZE_BAMS { + + label "process_high" + container "registry.hub.docker.com/leonhafner/openjdk:17" + + input: + path bams, stageAs: "reformatted_bams/*" + path bais, stageAs: "reformatted_bams/*" + path cellmarkfiletable + path chromsizes + + output: + path "binarized_bams" + + script: + """ + ChromHMM.jar BinarizeBam \ + $chromsizes \ + reformatted_bams \ + $cellmarkfiletable \ + binarized_bams + """ + + stub: + """ + mkdir binarized_bams + touch binarized_bams/chr1.txt + touch binarized_bams/chr2.txt + """ +} diff --git a/modules/local/chromhmm/get_output.nf b/modules/local/chromhmm/get_output.nf new file mode 100644 index 00000000..00a1f52a --- /dev/null +++ b/modules/local/chromhmm/get_output.nf @@ -0,0 +1,24 @@ +process GET_OUTPUT { + + label "process_single" + container "quay.io/biocontainers/pandas:1.4.3" + + input: + tuple val(meta), path(emissions), path(bed) + + output: + tuple val(meta), path("enhancers_${bed.baseName.split('_')[0]}.bed") + + script: + """ + get_chromhmm_results.py \ + --emissions $emissions \ + --bed $bed \ + --output enhancers_${bed.baseName.split('_')[0]}.bed + """ + + stub: + """ + touch enhancers_${bed.baseName.split('_')[0]}.bed + """ +} \ No newline at end of file diff --git a/modules/local/chromhmm/learn_model.nf b/modules/local/chromhmm/learn_model.nf new file mode 100644 index 00000000..99b216ee --- /dev/null +++ b/modules/local/chromhmm/learn_model.nf @@ -0,0 +1,36 @@ +process LEARN_MODEL { + + label "process_high" + container "registry.hub.docker.com/leonhafner/openjdk:17" + + input: + path binarized_bams + val states + + output: + path "ChromHMM_output/emissions_${states}.txt", emit: emissions + path "ChromHMM_output/*_${states}_dense.bed", emit: beds + + script: + """ + # Organism (PLACEHOLDER) only needed for downstream analysis of ChromHMM and therefore not supplied + + ChromHMM.jar LearnModel \ + -p $task.cpus \ + $binarized_bams \ + ChromHMM_output \ + $states \ + PLACEHOLDER + """ + + stub: + """ + mkdir ChromHMM_output + touch ChromHMM_output/emissions_${states}.txt + touch ChromHMM_output/L1_${states}_dense.bed + touch ChromHMM_output/L10_${states}_dense.bed + touch ChromHMM_output/P6_${states}_dense.bed + touch ChromHMM_output/P13_${states}_dense.bed + """ +} + diff --git a/modules/local/chromhmm/make_cellmarkfiletable.nf b/modules/local/chromhmm/make_cellmarkfiletable.nf new file mode 100644 index 00000000..9a59e6a9 --- /dev/null +++ b/modules/local/chromhmm/make_cellmarkfiletable.nf @@ -0,0 +1,21 @@ +process MAKE_CELLMARKFILETABLE { + + label "process_single" + container "quay.io/biocontainers/pandas:1.4.3" + + input: + path bam_design + + output: + path "cellmarkfiletable.txt" + + script: + """ + make_cellmarkfiletable.py --input $bam_design --output cellmarkfiletable.txt + """ + + stub: + """ + touch cellmarkfiletable.txt + """ +} diff --git a/modules/local/chromhmm/reformat_bam.nf b/modules/local/chromhmm/reformat_bam.nf new file mode 100644 index 00000000..367d1306 --- /dev/null +++ b/modules/local/chromhmm/reformat_bam.nf @@ -0,0 +1,23 @@ +process REFORMAT_BAM { + + label "process_medium" + container "registry.hub.docker.com/staphb/samtools" + + input: + tuple val(meta), path(bamFileIn, stageAs: "input/*") + // stage input to avoid name clashes + + output: + tuple val(meta), path("${bamFileIn.baseName}.${bamFileIn.extension}") + + script: + """ + reformat_bam.sh $bamFileIn ${bamFileIn.baseName}.${bamFileIn.extension} + """ + + stub: + """ + touch ${bamFileIn.baseName}.${bamFileIn.extension} + """ +} + diff --git a/modules/local/counts/create_anndata.nf b/modules/local/counts/create_anndata.nf index 9fd239dc..b79bc615 100644 --- a/modules/local/counts/create_anndata.nf +++ b/modules/local/counts/create_anndata.nf @@ -15,7 +15,7 @@ process CREATE_ANNDATA { script: """ - #!/usr/bin/env python + #!/usr/bin/env python3 import anndata as ad import pandas as pd diff --git a/modules/local/eh_atlas/fetch_links.nf b/modules/local/eh_atlas/fetch_links.nf index 12c57321..20c235bb 100644 --- a/modules/local/eh_atlas/fetch_links.nf +++ b/modules/local/eh_atlas/fetch_links.nf @@ -12,7 +12,7 @@ process FETCH_LINKS { script: """ - #!/usr/bin/env python + #!/usr/bin/env python3 import bs4 import requests diff --git a/modules/local/ehmm/construct_model.nf b/modules/local/ehmm/construct_model.nf index 46144a39..fc0dee20 100644 --- a/modules/local/ehmm/construct_model.nf +++ b/modules/local/ehmm/construct_model.nf @@ -10,7 +10,7 @@ process CONSTRUCT_MODEL { output: path('model'), emit: model_dir - path('model/model.txt'), emit: model + path('model/model.RData'), emit: model script: """ diff --git a/modules/local/liftover.nf b/modules/local/liftover.nf index a52ec06c..9b9d1a78 100644 --- a/modules/local/liftover.nf +++ b/modules/local/liftover.nf @@ -14,7 +14,7 @@ process LIFTOVER { script: """ - #!/usr/bin/env python + #!/usr/bin/env python3 from easyliftover import liftover_path diff --git a/modules/local/ranking/TFTG.nf b/modules/local/ranking/TFTG.nf index 356f2cc6..1ed5b03f 100644 --- a/modules/local/ranking/TFTG.nf +++ b/modules/local/ranking/TFTG.nf @@ -15,7 +15,7 @@ process TFTG_SCORE { script: """ - #!/usr/bin/env python + #!/usr/bin/env python3 import pandas as pd import argparse diff --git a/modules/local/ranking/collect_tfs.nf b/modules/local/ranking/collect_tfs.nf index 7d656fde..e7939d3d 100644 --- a/modules/local/ranking/collect_tfs.nf +++ b/modules/local/ranking/collect_tfs.nf @@ -15,7 +15,7 @@ process COLLECT_TFS { script: """ - #!/usr/bin/env python + #!/usr/bin/env python3 import json diff --git a/modules/local/rose/run_rose.nf b/modules/local/rose/run_rose.nf new file mode 100644 index 00000000..b25512c8 --- /dev/null +++ b/modules/local/rose/run_rose.nf @@ -0,0 +1,28 @@ +process RUN_ROSE { + tag "${meta.state}" + + label "process_single" + container "registry.hub.docker.com/library/python:3.11" + + input: + tuple val(meta), path(gff) + path ucsc_file + + output: + tuple val(meta), path("${gff.baseName}_STITCHED.gff") + + script: + """ + rose.py \ + -g ${ucsc_file} \ + -i ${gff} \ + -o ${gff.baseName}_STITCHED.gff \ + -s 12500 \ + -t 2500 + """ + + stub: + """ + touch "${gff.baseName}_STITCHED.gff" + """ +} \ No newline at end of file diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py index e55b8d43..2d514ce0 100755 --- a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """Provide functions to merge multiple versions.yml files.""" diff --git a/subworkflows/local/chromhmm.nf b/subworkflows/local/chromhmm.nf new file mode 100644 index 00000000..cfbd8857 --- /dev/null +++ b/subworkflows/local/chromhmm.nf @@ -0,0 +1,36 @@ +include { MAKE_CELLMARKFILETABLE } from "../../modules/local/chromhmm/make_cellmarkfiletable" +include { REFORMAT_BAM } from "../../modules/local/chromhmm/reformat_bam" +include { SAMTOOLS_INDEX as INDEX_BAM } from "../../modules/nf-core/samtools/index/main" +include { BINARIZE_BAMS } from "../../modules/local/chromhmm/binarize_bams" +include { LEARN_MODEL } from "../../modules/local/chromhmm/learn_model" +include { GET_OUTPUT } from "../../modules/local/chromhmm/get_output" + + + +workflow CHROMHMM { + take: + raw_bams + chromhmm_states + chromsizes + + + main: + MAKE_CELLMARKFILETABLE(Channel.fromPath(params.bam_design2)) + + REFORMAT_BAM(raw_bams) + + INDEX_BAM(REFORMAT_BAM.out) + + // map() is used to remove metadata + BINARIZE_BAMS(REFORMAT_BAM.out.map(entry -> entry[1]).collect(), INDEX_BAM.out.bai.map(entry -> entry[1]).collect(), MAKE_CELLMARKFILETABLE.out, chromsizes) + + LEARN_MODEL(BINARIZE_BAMS.out, chromhmm_states) + + // Add meta [state: {state}] to channel + ch_emission_bed = LEARN_MODEL.out.emissions.combine(LEARN_MODEL.out.beds.flatten()).map{item -> [[state: item[1].baseName.split("_")[0]], item[0], item[1]]} + + GET_OUTPUT(ch_emission_bed) + + emit: + enhancers = GET_OUTPUT.out +} diff --git a/subworkflows/local/peaks.nf b/subworkflows/local/peaks.nf index 354bc6cd..6821f611 100644 --- a/subworkflows/local/peaks.nf +++ b/subworkflows/local/peaks.nf @@ -15,6 +15,8 @@ include { COMBINE_TABLES as AFFINITY_RATIO } from "../../modules/local/combine_t include { COPY } from "../../modules/local/copy" include { EHMM } from './ehmm' include { GAWK as REMOVE_CHR } from "../../modules/nf-core/gawk/main" +include { CHROMHMM } from "./chromhmm" +include { ROSE } from "./rose" workflow PEAKS { take: @@ -69,6 +71,7 @@ workflow PEAKS { REMOVE_CHR(ch_footprints, []) ch_footprints_nochr = REMOVE_CHR.out.output + /* if (params.bam_design) { ch_bams = Channel.value(file(params.bam_design)) .splitCsv(header: true) @@ -99,7 +102,61 @@ workflow PEAKS { ch_footprints = EHMM.out.all } + */ + + if (params.bam_design2) { + // Parse design file + ch_bams = Channel.value(file(params.bam_design2)) + .splitCsv(header: ["state", "assay", "bam", "control"], sep: '\t') + .map{ + row -> [ + [ + id: row["state"] + "_" + row["assay"], + state: row["state"], + assay: row["assay"], + ], + [ + file(row["bam"]), file(row["control"]) + ], + ] + } + .multiMap{ + row -> + normal_bams: + [row[0], row[1][0]] + control_bams: + [row[0], row[1][1]] + } + // Reunite channels with unique controls + ch_bams = ch_bams.normal_bams.mix(ch_bams.control_bams.unique{ it[1] }) + + CHROMHMM( + ch_bams, + params.chromhmm_states, + params.chromsizes + ) + + // Add id field to meta object for use in nf-core modules + ch_chromhmm_enhancers = CHROMHMM.out + .map{meta, path -> + meta = meta + [id: "enhancers_" + meta.state] + [meta, path] + } + + ROSE( + ch_chromhmm_enhancers, + params.ucsc_file + ) + // Adapt process output to previous design of ch_footprints + ch_footprints = ROSE.out + .map{meta, bed -> + meta = meta + [antibody: "enhancers"] + [meta, bed] + } + + } + if (params.blacklist) { SUBTRACT_BLACKLIST(ch_footprints.map{ meta, bed_file -> [meta, bed_file, params.blacklist]}) ch_blacklisted = SUBTRACT_BLACKLIST.out.bed diff --git a/subworkflows/local/rose.nf b/subworkflows/local/rose.nf new file mode 100644 index 00000000..6f4f1dc6 --- /dev/null +++ b/subworkflows/local/rose.nf @@ -0,0 +1,22 @@ +include { GAWK as BED_TO_GFF } from "../../modules/nf-core/gawk/main" +include { GAWK as REFORMAT_GFF } from "../../modules/nf-core/gawk/main" +include { RUN_ROSE } from "../../modules/local/rose/run_rose" +include { GAWK as ROSE_OUTPUT_TO_BED } from "../../modules/nf-core/gawk/main" + +workflow ROSE { + take: + input_bed + ucsc_file + + main: + BED_TO_GFF(input_bed, []) + + REFORMAT_GFF(BED_TO_GFF.out.output, []) + + RUN_ROSE(REFORMAT_GFF.out.output, ucsc_file) + + ROSE_OUTPUT_TO_BED(RUN_ROSE.out, []) + + emit: + stitched_enhancers = ROSE_OUTPUT_TO_BED.out.output +} \ No newline at end of file