# ------------------------------------------------------------------------------
# Here we define all rules used in the context of the HiC enrichment analyses.
# This entails, e.g., the rules to preprocess the raw data from the
# Javierre et al. 2016 paper.
# The long-range HiC data are taken from the Javierre et al. supplement;
# we use their TAD definitions and high-confidence HiC contacts.
# The raw data are used to perform an enrichment analysis for the trans-meQTL
# associations.
#
# @author Johann Hawe <[email protected]>
# ------------------------------------------------------------------------------
# input/output directories and files
HIC_RESULT_DIR = "javierre-2016/"
TAD_DIR = "javierre-2016/TADs/"
CTs = glob_wildcards(TAD_DIR + "TADs_{celltype}_mean_merged.bed")
# which kinds of meQTL to process
MEQTL_TYPES = ["longrange", "trans"]
# number of iterations for the enrichment sampling (iterations 1 through 150)
ITERS = range(1, 151)
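# For illustration: given a TAD file such as
# "javierre-2016/TADs/TADs_Mon_mean_merged.bed" (the cell-type name "Mon" is a
# hypothetical placeholder), glob_wildcards() above would yield
# CTs.celltype == ["Mon"]; these values drive the merge_tads rule below.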
# ------------------------------------------------------------------------------
# Define local rules (not to be submitted to the cluster)
# ------------------------------------------------------------------------------
localrules: hicup2homer, merge_tag_counts, get_tag_counts, all_hic, merge_tads
# -----------------------------------------------------------------------------
# Get the HOMER input files from the HiCUP results.
# As of now, it seems we have to create the HiCUP-processed BAM files manually
# (these are the inputs for this rule).
# See the project's README for information on how to obtain them.
# -----------------------------------------------------------------------------
HIC = glob_wildcards(HIC_RESULT_DIR + "hicup/{sample}.hicup.bam")
rule hicup2homer:
    input:
        HIC_RESULT_DIR + "hicup/{sample}.hicup.bam"
    output:
        HIC_RESULT_DIR + "hicup/{sample}.hicup.bam.homer"
    threads: 1
    resources:
        mem_mb=1000
    params:
        time = "00:30:00"
    log:
        "logs/hicup2homer/{sample}.log"
    benchmark:
        "benchmarks/hicup2homer/{sample}.bmk"
    shell:
        """
        hicup2homer {input} &> {log}
        """
# -----------------------------------------------------------------------------
# Get the tag counts for specific bins for all contact regions from the HOMER
# files using an awk script. Only trans (inter-chromosomal) contacts are retained.
# -----------------------------------------------------------------------------
rule get_tag_counts:
    input:
        HIC_RESULT_DIR + "hicup/{sample}.hicup.bam.homer"
    output:
        HIC_RESULT_DIR + "tag_counts/{res}/{sample}_counts.txt"
    threads: 1
    resources:
        mem_mb=500
    params:
        time = "00:40:00"
    log:
        "logs/get_tag_counts/{sample}_{res}.log"
    benchmark:
        "benchmarks/get_tag_counts/{sample}_{res}.bmk"
    shell:
        """
        res={wildcards.res}
        awk '{{OFS="\\t"; if($2 != $5) {{ b=int($3/'$res')*'$res'; d=int($6/'$res')*'$res'; a=$2; c=$5; if($2>$5) {{ a=$5; e=b; b=d; d=e; c=$2 }}; print(a,b,c,d) }} }}' {input} | \
        sort | uniq -c | awk '{{OFS="\\t"; print($1,$2,$3,$4,$5) }}' > {output}
        """
# -----------------------------------------------------------------------------
# Merge all per-sample tag count files and filter contacts by a support cutoff
# -----------------------------------------------------------------------------
rule merge_tag_counts:
    input:
        expand(HIC_RESULT_DIR + "tag_counts/{{res}}/{sample}_counts.txt", sample=HIC.sample)
    output:
        HIC_RESULT_DIR + "tag_counts/{res}/merged_gte{cutoff}.txt"
    threads: 1
    resources:
        mem_mb=5000
    params:
        time = "01:00:00"
    shell:
        """
        cut -f 2,3,4,5 {input} | sort | uniq -c | awk '{{OFS="\\t"; if($1>={wildcards.cutoff}) print($1,$2,$3,$4,$5)}}' - > {output}
        """
# -----------------------------------------------------------------------------
# Get a merged BED file for all TADs identified in the individual cell types.
# Possible extension: annotate the cell types for the individual regions.
# -----------------------------------------------------------------------------
rule merge_tads:
    input:
        expand(TAD_DIR + "TADs_{ct}_mean_merged.bed", ct=CTs.celltype)
    output:
        HIC_RESULT_DIR + "TADs_merged.bed"
    shell:
        "cat {input} | sort -nr | uniq | sort -n > {output}"
# -----------------------------------------------------------------------------
# Create simple text files containing the list of all measured CpGs from the
# KORA data as well as the meQTL CpG and SNP IDs from the cosmo object.
# These are used for the sample-based HiC enrichment analysis.
# -----------------------------------------------------------------------------
rule create_entity_lists:
    input:
        beta="KF4_beta_qn_bmiq.RData",
        cosmo="cosmopairs_combined_151216.RData"
    output:
        beta="measured-cpgs.txt",
        cosmo_cpg="cosmo-cpgs.txt",
        cosmo_snp="cosmo-snps.txt"
    threads: 1
    resources:
        mem_mb = 7000
    params:
        time = "00:30:00"
    log:
        "logs/create_entity_lists.log"
    script:
        "../R/extract-cpg-ids.R"
# ------------------------------------------------------------------------------
# Generate the input pairs for the HiC enrichment runs (one file per category)
# ------------------------------------------------------------------------------
rule generate_enrichment_pairs:
    input:
        pruned = "sentinel_pairs_r02_110417.RData",
        cosmo = "cosmopairs_combined_151216.RData"
    output:
        HIC_RESULT_DIR + "pairs_to_analyze/{category}_pairs.tsv"
    threads: 1
    resources:
        mem_mb = 4000
    params:
        time = "00:20:00"
    log:
        "logs/tad_hic_enrichment/generate_enrichment_pairs/{category}.log"
    benchmark:
        "benchmarks/tad_hic_enrichment/generate_enrichment_pairs/{category}.bmk"
    script:
        "../R/tad_hic_enrichment/generate_enrichment_pairs.R"
# ------------------------------------------------------------------------------
# Generic rule to perform the enrichment on arbitrary pairs.
# The actual pairs have to be provided via the 'pairs' input and should contain
# four columns: sentinel.snp, snps, sentinel.cpg, cpgs, providing for each
# 'sentinel pair' the original list of SNPs and CpGs (see the sketch below).
# Currently, to analyze a different set of pairs one has to duplicate the rule,
# rename it accordingly, and adjust the 'pairs' input file as desired.
# ------------------------------------------------------------------------------
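# A minimal sketch of the expected pairs file (tab-separated; the IDs and the
# comma-separated list encoding are hypothetical placeholders):
#   sentinel.snp    snps                    sentinel.cpg    cpgs
#   rs0000001       rs0000001,rs0000002     cg00000001      cg00000001,cg00000002
# Note: the 'pairs', 'hic' and 'tads' inputs below lack the HIC_RESULT_DIR
# prefix used by the producing rules above; presumably these files are staged
# into the working directory beforehand.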
rule enrich_meqtl_hic:
    input:
        pchic="PCHiC_peak_matrix_cutoff5.tsv",
        hic="hic_merged_gte3.txt",
        tads="TADs_merged.bed",
        pairs="pairs_to_analyze/{category}_pairs.tsv",
        cpg_info="cosmo-cpgs.txt",
        cpgs_measured="measured-cpgs.txt",
        snp_info="cosmo-snps.txt",
        msd="cpg_annotation_lolipop.rds",
        allele_freqs="allele_frequencies_lolipop.rds",
    output:
        HIC_RESULT_DIR + "{category}_enrichment/{res}/iteration{i}.RData"
    resources:
        mem_mb=2000
    threads: 16
    params:
        dist_tolerance=10000,
        hic_ext=1000,
        time = "05:00:00"
    log:
        "logs/enrich_meqtl/{category}_res{res}_iter{i}.log"
    benchmark:
        "benchmarks/enrich_meqtl/{category}_res{res}_iter{i}.bmk"
    script:
        "../R/tad_hic_enrichment/enrich_tad_hic.R"
# -----------------------------------------------------------------------------
# Target rule to compute all enrichment iterations
# -----------------------------------------------------------------------------
rule all_hic:
    input:
        expand(HIC_RESULT_DIR + "trans_enrichment/50000/iteration{i}.RData", i=ITERS),
        expand(HIC_RESULT_DIR + "longrange_enrichment/50000/iteration{i}.RData", i=ITERS)