-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathREADME.Rmd
577 lines (422 loc) · 24.6 KB
/
README.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
---
output: github_document
indent: true
---
<!-- README.md is generated from README.Rmd. Please edit that file -->
```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "man/figures/README-"
# out.width = "100%"
)
```
# CLIPreg
<!-- badges: start -->
<!-- badges: end -->
The goal of CLIPreg is to discover key RBP regulators in different datasets. It combines CLIP-seq with RNA- and RIBO-seq to calculate enrichment of RBP and generate plots for publications.
Another feature that can be analyzed by CLIPreg is enrichment of miRNA targets from TargetScan database.
## Installation
### Check and install required packages
Users may use following codes to check and install all the required packages.
``` r
list.of.packages <- c("ggplot2","grid","doParallel","foreach","data.table","fastmatch","GGally","ggnet","topGO","ALL","devtools","org.Hs.eg.db","DESeq2")
## for package "ggplot2", "pheatmap", "grid", "doParallel", "foreach", "data.table", "fastmatch", "GGally"
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
## for package "topGO", "ALL", "ggnet", "ComplexHeatmap","org.Hs.eg.db", "DESeq2"
if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) BiocManager::install(new.packages)
if("ggnet"%in%new.packages) devtools::install_github("briatte/ggnet")
devtools::install_version("network", version = "1.16.1", repos = "http://cran.us.r-project.org")
if("ComplexHeatmap"%in%new.packages) devtools::install_github("jokergoo/ComplexHeatmap")
```
### Install CLIPreg
The source code of CLIPreg can be installed from [GitHub](https://github.com/) with:
``` r
devtools::install_github("SGDDNB/CLIPreg")
```
CLIPreg requires 4 different inputs. A gene_groups input which is a dataframe containing geneID and the gene_groups given by DeltaTE output (ideally). The DeltaTE method can be found in the paper here: https://doi.org/10.1002/cpmb.108. It categorizes transcriptionally and/or translationally regulated genes into 4 categories: forwarded, intensified, exclusive and buffered, each category with an up or down direction. A deltaTE function is included in this package, it can be found in advance usage step 0.
The 3 other required files are : summarized CLIP-seq data, with POSTAR and ENCODE which are pre-loaded in the package, RIBO log fold change (lfc) and TPM.
```{r load library, message=FALSE,warning=FALSE}
## load libraries
library(CLIPreg)
library(ggplot2)
library(ComplexHeatmap)
library(grid)
library(doParallel)
library(ggnet)
library(topGO)
library(GGally)
library(data.table)
library(stringr)
library(DESeq2)
```
## Basic usage
For testing the package with the example data. This will load the summary data of POSTAR and ENCODE as well as the TPM, lfc and gene_groups from the fibroblast dataset used in the paper.
```{r load example data, message=FALSE, eval=F}
Example=Load_example()
```
For using your own data, you must specify a folder that contains at least 3 txt files that are named: gene_groups.txt, ribo_lfc.txt and ribo_tpm.txt. For the format of those 3 files, please refer to Advance usage step 1.
```{r load user data, message=FALSE, eval=F}
Input_data=Load_input_files(folder = "path/to/folder")
```
Run the analysis with all default parameters.
```{r Run default analysis, message=FALSE, eval=F}
results=run_CLIPreg(Example, is.example=T) # or run_CLIPreg(Input_data, is.example=F)
```
Generate the visual output from the results of the analysis. The Visualise function will create 2 new files in the folder given: an Rdata file containing the list object from the run_CLIPreg function and a pdf file with the main figures. The results are created with all default parameters.
```{r RVisualize, message=FALSE,warning=FALSE, eval=F}
dir.create("Results_CLIPreg")
Visualise(results=results,folder="Results_CLIPreg")
```
## Advance usage
### Step 0 : Run deltaTE to find gene groups
This has to be done on the user's dataset from raw counts of RIBO and RNA as well as a coldata dataframe that can run through DESeq2. Put batch = 1 if there is a batch column in the design.
The counts must have a first column containing the geneID or gene names, they must be unique. The other columns are the raw counts of RNA and RIBO. The column names for the counts must be the same names as filled in the "SampleID" column of coldata.
The coldata dataframe must have column names "SampleID", "SeqType", "Condition" and "Batch". (Batch is an optional column). Condition must contain 1 and 2. For example, in an untreated vs treatment drug experiment, set the untreated sample condition to 1 and treatment to 2.
An example is provided in the github folder Example/Example_deltaTE.R . This function creates a folder in your current directory where it will write the DESeq2 output for RNA and RIBO as well as the gene lists of the different gene categories.
```{r create gene groups from deltaTE, message=FALSE, eval=F}
gene_groups=deltaTE_gene_groups(counts=counts,coldata=coldata,batch=0)
```
### Step 1: Input datasets
Let's have a look at the gene_groups file from the example data. It consists in 2 columns containing the "geneID" and the "Gene_group" for all the DE genes. This is as an example on fibroblasts stimulated by TGFB, when you start with your own gene groups, go to next code line. Ideally, the gene groups are obtained from deltaTE analysis but any method to find gene groups can also be run.
```{r load gene_groups, message=FALSE}
data("gene_groups")
head(gene_groups)
```
To input your own gene groups, use the function load_gene_groups() and give the file location of your gene group file as input. Each gene group must contain "up" or "down" in the name or you won't be able to plot the network and gene ontology.
```{r load gene_groups from user, message=FALSE, eval=F}
gene_groups=load_gene_groups(gene_groups_file = "path/to/gene_groups_file.txt")
```
Load POSTAR and ENCODE RBP data. Those are 2 public datasets which are processed in order to have lists of vector. See "Pre-proessing" part of this readme to find out more details. Each vector is named after 1 RBP and contains the geneID of all the targets of that RBP. Combine both data in a target list in the variable Targets.
combine_targets function is a summary of both RBP data by filtering only targets which are present in the background. This is to save time for the later analysis in the clipreg function.
```{r load RBP data, message=FALSE}
data("RBP_ENCODE")
data("RBP_POSTAR")
Targets=combine_targets(RBP_list1=RBP_ENCODE,RBP_list2=RBP_POSTAR,background=gene_groups$geneID)
Targets[c("POLR2G","PPIL4","SBDS")]
```
Load the fold change and identify the RBPs. If you have your own then provide your own data.
ribo_lfc has 3 variables : "geneID", "IDENTIFIER" and "Log2FoldChange". ribo_lfc only contains genes that have been filtered to be significantly changing at the ribo level.
tpm_ribo must have only counts as columns and geneID as rownames
```{r loading LFC and TPM, message=FALSE}
# load fold change and tpm. optional if you want to use the input data
data("ribo_lfc")
data("tpm_ribo")
head(ribo_lfc)
head(tpm_ribo[,1:4])
```
To load your own data use:
```{r loading your own LFC and TPM, message=FALSE,eval=F}
# If you want to input your own data
ribo_lfc=load_ribo_lfc(ribo_lfc_file = "ribo_lfc_file")
tpm_ribo=load_ribo_tpm(ribo_tpm_file = "ribo_tpm_file") # make sure rownames(tpm_ribo) corresponds to the geneID
```
### Step 2: Data integration and analysis
Run the enrichment analysis using CLIPreg() function. This takes several minutes depending on how many gene groups and how many genes per gene group there are. If you want to have a look at the example results skip these 4 lines of code.
For each RBP in each gene group, this function computes the over representation of the RBP's targets by calculating z-score, p-value and padj (corresponding to FDR).
The output of CLIPreg() is a list of dataframes. One dataframe per gene group containing the RBP and statistical information calculated during the analysis such as p-value and z-score.
```{r run enrichment analysis, message=FALSE,eval=F}
# The CLIPreg function requires a few minutes to run, save the data after running it to be sure not to lose it
res_Postar=CLIPreg(Regulator_data=RBP_POSTAR,gene_groups=gene_groups)
res_Encode=CLIPreg(Regulator_data=RBP_ENCODE,gene_groups=gene_groups)
save(res_Encode,file="Res_RBP_Encode.RData")
save(res_Postar,file="Res_RBP_Postar.RData")
```
If you want to get the results directly you can load it by using the example data results. The result is a list of 8 dataframes, 8 being the number of the gene groups found in this dataset. Each dataframe contains a statistic summary of the enrichment of the targets of the RBP. The CLIPreg function calculates the over-representation of RBP targets in each of the gene groups. This is done by calculating the empirical p-value of the frequency of interactions between any given RBP and each individual DeltaTE group by comparing the number of observed interactions with a null distribution generated from repeated shuffling (n = 100,000 iterations) of the RBP-mRNA interactions.
real_overlap corresponds to the actual overlap between the RBP targets and the gene group while simulated_overlap_mean and sd correspond to the mean and average of the null distribution.
```{r load res, message=FALSE}
data("res_Encode")
data("res_Postar")
head(res_Encode$exclusive_down)
```
Then we want to combine POSTAR and ENCODE to work with only one dataframe and only keep RBPs that are significant in at least one gene group. If an RBP is present in both POSTAR and ENCODE, only the most significant result is kept.
```{r combining POSTAR and ENCODE, message=FALSE}
res=CLIPreg::combine(res1=res_Encode,res2=res_Postar,FDR=0.05)
head(res[[1]])
```
Extract the RBP fold change from the ribo_lfc and keep only detected RBPs in res. If an RBP is not changing, it doesn't make sense to look at the enrichment of its targets so you should shortlist the RBP to only keep the changing ones.
```{r Extract LFC of RBPs, message=FALSE}
# Change of RBPs
rbp_lfc=rbp_change(res=res,ribo_lfc=ribo_lfc)
head(rbp_lfc)
# Cure res data by removing RBPs that are not in the rbp_lfc dataframe
res=cure_res(res=res,regulators=rbp_lfc)
head(res[[1]])
```
### Step 3: Visualisation
Generate and save heatmap to pdf. The heatmap represents the -logFDR of each RBP for each gene group. The blue RBPs are downregulated and the orange RBPs are upregulated. Only RBPs with targets that are significantly enriched in at least one gene group are shown.
```{r Generate heatmap,fig.height=15,fig.width=10}
# Heatmap of RBP scores
HeatmapRBP(res=res,rbp_lfc=rbp_lfc)
# If there is not at least 1 positive and 1 negative RBP lfc then use the heatmap for miRNA
# Heatmap_no_fold_change(res=res)
```
Suggestion for saving your heatmap
```{r save heatmap to pdf, eval=F}
# Save the heatmap
p=HeatmapRBP(res=res,rbp_lfc=rbp_lfc)
location="Heatmap_fibroblasts.pdf"
n=length(p@ht_list$`RBP direction`@matrix)
pdf(location,length(names(res)),3+n*0.15)
dev.off()
```
A bubble plot can be generated to see the overall representation of the RBPs' targets. This can only be done if the gene groups are generated from deltaTE_gene_groups function or with gene groups having similar names to deltaTE groups.
```{r Bubble plot, message=FALSE}
# Bubble plot gene_groups if gene_groups are from DeltaTE. FDR has to be lower or equal to the FDR put in CLIPreg::combine() step
BubbleRBPs(res = res,gene_groups = gene_groups,FDR=0.05)
```
From the results, the user can choose a number of RBP to draw the network for by n. This will pick the n most changing RBPs based on fold change for the network. In the case where the user wants to draw a network with specific RBPs, he can input rpb_lfc as a character vector containing the RBP of interest
```{r Network,message=FALSE,warning=FALSE}
# Draw network of the RBP that are most changing or choose specific RBPs
Draw_network_by_group(regulators=rbp_lfc,res=res,Targets=Targets,gene_groups=gene_groups,n=5,forwarded = F)
# You can subset your RBP_lfc to keep only your RBPs of interest. Make sure n = the number of RBP you want to plot.
# Draw_network_by_group(rbp_lfc=c("CELF2","HNRNPF","DDX24"),
# res=res,Targets=Targets,gene_groups=gene_groups,n=3,forwarded = F)
```
Gene ontology can be plotted for specific nodes or RBP. The P-value corresponds to Fisher's exact test p-value. This p-value is obtained following the steps of the topGO vignette package. This p-value is not corrected.
```{r GO of specific nodes,fig.width=15, message=FALSE,warning=FALSE}
# plot GO, each plot takes a couple of minutes to generate.
Plot_GO(regulators=rbp_lfc,res=res,Targets=Targets,gene_groups=gene_groups,n=5,
tpm_ribo = tpm_ribo,th=200,GO_to_show=3,forwarded = F)
Plot_GO_node_name(regulators=rbp_lfc,res=res,Targets=Targets,gene_groups=gene_groups,n=5,
tpm_ribo = tpm_ribo,Nodes_to_keep=c(19,15),GO_to_show=3,forwarded = F)
Plot_GO_RBP(rbp_of_interest="QKI",tpm_ribo = tpm_ribo,Targets=Targets,gene_groups=gene_groups,GO_to_show=3)
```
## Analysis of miRNA target enrichment
The same analysis can be applied to run miRNA target enrichment. Here is an example of the code that is very similar to the steps followed for the RBPs.
```{r Scipt for miRNA loading necessary data}
data("miR_data") # preparation can be found below in section "Processing of miRNA file"
data("miR_info")
data("gene_groups") # for example on fibroblasts
Targets=GetTarget(Regulator_data=miR_data,background=gene_groups$geneID)
data("ribo_lfc")
data("tpm_ribo")
data("tpm_all_RNA")
# It's important to only keep miR which are detected transcriptionally
tpm_all_RNA=tpm_all_RNA[rowSums(tpm_all_RNA > 1) >= 1, ]
miR_info=miR_info[miR_info$ensembl_gene_id%in%rownames(tpm_all_RNA),]
miR_data=subset(miR_data,names(miR_data)%in%miR_info$mirbase_id)
```
You can then run the CLIPreg function to the data to get the enrichment details in each gene group.
```{r Scipt for miRNA run CLIPreg,eval=F}
#This is done on the example set, you can skip to next code for the results
res_miR=CLIPreg(Regulator_data=miR_data,gene_groups=gene_groups) # Takes several minutes
save(res_miR,file="Res_miR.RData")
```
The output format is the same as for CLIPreg run on RBPs
```{r Scipt for miRNA showing results}
data("Res_miR")
res=CLIPreg::combine(res1=res_miR,res2 = res_miR,FDR=0.05)
head(res[[1]])
```
A heatmap can be plotted also for miRNA
```{r Scipt for miRNA heatmap}
Heatmap_no_fold_change(res=res)
# Suggestion to save the heatmap
# e=Heatmap_no_fold_change(res=res)
# location="Heatmap_HeLa_EGF_miRNA.pdf"
# n=nrow(e@matrix)
# pdf(location,length(names(res))+3,3+n*0.15)
# e
# dev.off()
```
We found that miRNAs targets were enriched in the forwarded down and buffered up groups as shown in this heatmap. These were both groups where the RNA levels are downregulated which aligns with the known function of miRNAs to silence or downregulate RNA transcription. Moreover we also found some miRNAs that have been known to be important in fibroblast activation such as miR-199a-5p which is known to promote pathogenic activation of fibroblast by regulating CAV1 gene (Lino Cardenas et al., 2013). We not only found CAV1 as a target of miR-199a-5p in the forwarded down group but also 100s/1000s of other targets that could potentially be playing a role in this miRNA’s regulatory network towards fibroblast activation.
Network and GO plots are also available for miRNA
```{r Scipt for miRNA plots}
# Network and GO
# As we don't have fold change for miRNA, you can input miR names in the regulators for the plots.
Draw_network_by_group(regulators=c("hsa-mir-301a","hsa-mir-454","hsa-mir-544a","hsa-mir-106b","hsa-mir-148b"),res=res,Targets=Targets,gene_groups=gene_groups,n=5,forwarded = T)
Plot_GO_RBP(rbp_of_interest="hsa-mir-301a",tpm_ribo = tpm_ribo,Targets=Targets,gene_groups=gene_groups,GO_to_show=3)
```
## Example 2: CLIPreg without using deltaTE
HeLa stimulated by EGF. Groups were obtained by categorizing into TE_up and TE_down from data given in the paper https://pubmed.ncbi.nlm.nih.gov/30466063/. Data preparation can be found in the "Pre-Processing" part of the readme.
First loading the required tables.
``` {r CLIPreg on HeLa stimulated by EGF, fig.width=15, message=FALSE,warning=FALSE,eval=F}
# RBP analysis
data("gene_groups_HeLa_EGF")
data("ribo_lfc_HeLa_EGF")
data("tpm_ribo_HeLa_EGF.RData")
data("RBP_ENCODE")
data("RBP_POSTAR")
Targets_HeLa=combine_targets(RBP_list1=RBP_ENCODE,RBP_list2=RBP_POSTAR,background=gene_groups$geneID)
```
Run RBP target enrichment.
```{r Running CLIPreg on HeLa main,eval=F}
res_Postar_HeLa=CLIPreg(Regulator_data=RBP_POSTAR,gene_groups=gene_groups)
res_Encode_HeLa=CLIPreg(Regulator_data=RBP_ENCODE,gene_groups=gene_groups)
save(res_Encode_HeLa,file="Res_Encode_HeLa.RData")
save(res_Postar_HeLa,file="Res_Postar_HeLa.RData")
```
```{r clean data HeLa RBP res}
data("Res_Encode_HeLa")
data("Res_Postar_HeLa")
res_HeLa=CLIPreg::combine(res1=res_Encode_HeLa,res2=res_Postar_HeLa,FDR=0.05)
rbp_lfc=rbp_change(res=res_HeLa,ribo_lfc=ribo_lfc) # None of the RBP are changing
res_HeLa=cure_res(res=res_HeLa,regulators=rbp_lfc)
rbp_lfc
```
As we can see, none of the RBP are changing. If you still want to plot the heatmap although no RBP are found changing, you can use the Heatmap_no_fold_change function which doesn't require regulators.
```{r plots HeLa RBP}
Heatmap_no_fold_change(res=res_HeLa) # The RBP are not changing --> need to use the miRNA heatmap
```
Not enough RBP are significant with enough targets to give any meaning to the network and gene ontology plot
```{r network not enough genes,eval=F}
# Draw_network_by_group(regulators=c("GRWD1","GRSF1","SUGP2","DHX30","LSM11"),res=res,Targets=Targets_HeLa,gene_groups=gene_groups,n=5,forwarded = F)
# Plot_GO_RBP(rbp_of_interest="GRWD1",tpm_ribo = tpm_ribo,Targets=Targets_HeLa,gene_groups=gene_groups,GO_to_show=3)
```
miRNA analysis can also be applied to this data. Look at "Analysis of miRNA target enrichment" in this readme for more details about how miR enrichment steps work.
``` {r miR on HeLa stimulated by EGF, fig.width=15, message=FALSE,warning=FALSE,eval=F}
# miRNA analysis
data("gene_groups_HeLa_EGF")
data("ribo_lfc_HeLa_EGF")
data("tpm_ribo_HeLa_EGF.RData")
data("tpm_all_RNA_HeLa_EGF")
data("miR_data")
data("miR_info")
tpm_all_RNA=tpm_all_RNA[rowSums(tpm_all_RNA > 1) >= 1, ]
miR_info=miR_info[miR_info$ensembl_gene_id%in%rownames(tpm_all_RNA),]
miR_data=subset(miR_data,names(miR_data)%in%miR_info$mirbase_id)
res_miR=CLIPreg(Regulator_data=miR_data,gene_groups=gene_groups)
save(res_miR,file="Res_miR_HeLa_EGF.RData")
res=CLIPreg::combine(res1=res_miR,res2=res_miR,FDR=0.05)
Heatmap_no_fold_change(res=res) # need to use the Heatmap_no_fold_change
Targets=GetTarget(Regulator_data=miR_data,background=gene_groups$geneID)
Draw_network_by_group(regulators=c("hsa-mir-652","hsa-mir-499a","hsa-mir-99b","hsa-mir-875","hsa-mir-501"),res=res,Targets=Targets,gene_groups=gene_groups,n=5,forwarded = F)
Plot_GO_RBP(rbp_of_interest="hsa-mir-501",tpm_ribo=tpm_ribo,Targets=Targets,gene_groups=gene_groups,
GO_to_show=3)
```
## Pre-processing
### CLIP data preparation
```{r Encode postar processing}
# Processing of CLIPseq summary files
data("Example_bed")
Example_bed
# Step 1 : Download all RBP clip-seq bed files from encode and combine them into 1 be
# Step 2 : Filter to keep high score peaks only. >8fold enrichment and P value <10e-5
# Step 3 : Intersect bed file with Ensembl genes gtf file to find target genes
# Step 4 : Shortlist columns to keep RBP and targets
# For Postar data, download from POSTAR3 website CLIPdb
# Start from step3 of ENCODE
# The RBP_data format, obtained after processing the bed file, needs to be a list of RBPs with each RBP containing a vector of target geneIDs. For Example :
data("RBP_ENCODE")
RBP_ENCODE[c("POLR2G","PPIL4","SBDS")]
```
### Processing of miRNA file
``` {r processing miRNA file,eval=F}
# Processing of miRNA file
BiocManager::install("miRBaseConverter")
BiocManager::install("biomaRt")
library(biomaRt)
library(data.table)
library(miRBaseConverter)
library(stringr)
# miR file was downloaded from TargetScan context++
miR=fread("~/Downloads/Predicted_Targets_Context_Scores.default_predictions.txt/Predicted_Targets_Context_Scores.default_predictions.txt")
# simplifying names to make sure target scan names are the same as mirBase names for all miRNAs
index=str_sub(miR$miRNA,-1,-1)=="p"
miR$miRNA[index]=gsub('.{3}$', '',miR$miRNA[index] )
miRBase=getAllMiRNAs()
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
G_list <- getBM(filters= "mirbase_id", attributes= c("ensembl_gene_id","mirbase_id"),values=miRBase$Name,mart= mart)
index=str_sub(G_list$mirbase_id,-1,-1)=="p"
G_list$mirbase_id[index]=gsub('.{3}$', '',G_list$mirbase_id[index] )
index=str_sub(G_list$mirbase_id,-2,-2)=="-"
G_list$mirbase_id[index]=gsub('.{2}$', '',G_list$mirbase_id[index] )
index=str_sub(miRBase$Name,-1,-1)=="p"
miRBase$Name[index]=gsub('.{3}$', '',miRBase$Name[index] )
index=str_sub(miRBase$Name,-2,-2)=="-"
miRBase$Name[index]=gsub('.{2}$', '',miRBase$Name[index] )
# Keeping only targets which have a weighted context score below -0.1
Thres=-0.1
miR=miR[tolower(miR$miRNA)%in%tolower(miRBase$Name),]
miR=miR[miR$`weighted context++ score`<Thres,]
miR_names=unique(miR$miRNA)
# Saving in a format readable for CLIPreg
miR_data=list()
for (i in miR_names) {
miR_i=miR[miR$miRNA==i,]
miR_data[[i]]=unique(miR_i$`Gene ID`)
miR_data[[i]]=gsub("\\..*","",miR_data[[i]])
}
miR_data=miR_data[sort(names(miR_data))]
miR_data=subset(miR_data,tolower(names(miR_data))%in%tolower(G_list$mirbase_id))
names(miR_data)=tolower(names(miR_data))
save(miR_data,file = "data/miR_data.RData")
G_list=G_list[G_list$mirbase_id%in%c(miR_names,tolower(miR_names)),]
colnames(G_list)=c("geneID","miRBase_ID")
miR_info=G_list
miR_info$mirbase_id=tolower(miR_info$mirbase_id)
save(miR_info,file = "data/miR_info.RData")
```
### Example 2: Data pre-processing without using deltaTE
The csv files used for pre-processing can be found in the "Example" folder of CLIPreg's github page.
```{r gene groups preparation HeLa, eval=F}
BiocManager::install("biomaRt")
library(biomaRt)
# Get gene groups from existing data
file="HeLa_EGF_TE.csv"
df=read.csv(file)
sig_30=df$IDENTIFIER[df$pval_TE_30<0.05]
sig_60=df$IDENTIFIER[df$pval_TE_60<0.05]
sig_90=df$IDENTIFIER[df$pval_TE_90<0.05]
TE_genes=unique(c(sig_30,sig_60,sig_90))
df=df[df$IDENTIFIER%in%TE_genes,]
df=df[!duplicated(df$IDENTIFIER),]
gene_groups=data.frame(IDENTIFIER=df$IDENTIFIER,Gene_group=0)
for (i in 1:nrow(df)) {
ID=df$IDENTIFIER[i]
lowest_pval=which(df[i,4:6]==min(df[i,4:6]))
TE_sign=sign(df[i,lowest_pval+1])
if (TE_sign>0) {gene_groups$Gene_group[i]="TE_up"} else {gene_groups$Gene_group[i]="TE_down"}
}
# Associate geneID and IDENTIFIER
library('biomaRt')
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes <- df$IDENTIFIER
G_list <- getBM(filters= "hgnc_symbol", attributes= c("ensembl_gene_id","hgnc_symbol"),values=genes,mart= mart)
head(G_list)
colnames(G_list)=c("geneID","IDENTIFIER")
gene_groups=merge(G_list,gene_groups,by.y="IDENTIFIER")
save(gene_groups,file="data/gene_groups_HeLa_EGF.RData")
# prepare ribo_lfc data
file="HeLa_EGF_ribo_lfc.csv"
df=read.csv(file)
df[which(is.na(df),arr.ind = T)]=1
geneNames=unique(df$IDENTIFIER)
ribo_lfc=data.frame(IDENTIFIER=geneNames,Log2FoldChange=0)
for (i in 1:nrow(df)) {
lowest_pval=min(df[i,6:8])
if (lowest_pval<0.05) {
lowest_pval_index=which(df[i,6:8]==min(df[i,6:8]))[1]
ribo_lfc[ribo_lfc$IDENTIFIER==df$IDENTIFIER[i],"Log2FoldChange"]=df[i,lowest_pval_index+2]
}
}
library('biomaRt')
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes <- ribo_lfc$IDENTIFIER
G_list <- getBM(filters= "hgnc_symbol", attributes= c("ensembl_gene_id","hgnc_symbol"),values=genes,mart= mart)
head(G_list)
colnames(G_list)=c("geneID","IDENTIFIER")
ribo_lfc=merge(G_list,ribo_lfc,by.y="IDENTIFIER")
ribo_lfc=ribo_lfc[!duplicated(ribo_lfc$IDENTIFIER),]
rownames(ribo_lfc)=ribo_lfc$IDENTIFIER
save(ribo_lfc,file="data/ribo_lfc_HeLa_EGF.RData")
# TPM must be shortlistede to keep expressed genes only
file="HeLa_EGF_Ribo_TPM.csv"
df=read.csv(file)
df=df[rowSums(df[,2:7] > 1) >= 1,]
df=df[!duplicated(df$IDENTIFIER),]
library('biomaRt')
mart <- useDataset("hsapiens_gene_ensembl", useMart("ensembl"))
genes <- df$IDENTIFIER
G_list <- getBM(filters= "hgnc_symbol", attributes= c("ensembl_gene_id","hgnc_symbol"),values=genes,mart= mart)
head(G_list)
colnames(G_list)=c("geneID","IDENTIFIER")
tpm_ribo=merge(G_list,df,by.y="IDENTIFIER")
tpm_ribo=tpm_ribo[!duplicated(tpm_ribo$IDENTIFIER),]
rownames(tpm_ribo)=tpm_ribo$IDENTIFIER
save(tpm_ribo,file="data/tpm_ribo_HeLa_EGF.RData")
```