-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathCombinedLRT.R
168 lines (137 loc) · 5.48 KB
/
CombinedLRT.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
#!/usr/bin/env Rscript
# Scripts for calculating combined likelihood ratio
library(Seurat)
library(tidyverse)
library(BiocParallel)
# Load environment
MinMax <- function(data, min, max) {
data2 <- data
data2[data2 > max] <- max
data2[data2 < min] <- min
return(data2)
}
# Secondary function to address values that don't fit in the model
bimodLikData_FixNoVar <- function(x, xmin = 0) {
x1 <- x[x <= xmin]
x2 <- x[x > xmin]
xal <- MinMax(
data = length(x = x2) / length(x = x),
min = 1e-5,
max = (1 - 1e-5)
)
likA <- length(x = x1) * log(x = 1 - xal)
#likelihood of positivec cells
likB <- length(x = x2) * log(x = xal)
return(likA + likB)
}
#internal function to run mcdavid et al. DE test
#
#' @importFrom stats sd dnorm
#
bimodLikData <- function(x, xmin = 0) {
# x1 and x2 are 2 vectors representing 2 modes
# x1 for 0 values -> on/off distribution model
# x2 for positive values -> normal, continuous distribution
x1 <- x[x <= xmin]
x2 <- x[x > xmin]
# estimate proportion of positive cells
# use 1e-5 as min and 1-1e-5 as max (i.e. if there is only 1 nonzero among 100K cells)
xal <- MinMax(
data = length(x = x2) / length(x = x),
min = 1e-5,
max = (1 - 1e-5)
)
# likelihood for observing x1, 1-xal is expected ratio of 0 values
likA <- length(x = x1) * log(x = 1 - xal)
# calculate variance for x2, to be used in dnorm to calculate prob distribution
if (length(x = x2) < 2) {
mysd <- 1
} else {
mysd <- sd(x = x2)
}
# Likelihood for observing x2
likB <- length(x = x2) *
log(x = xal) +
sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
return(likA + likB)
}
# x = counts for test gene
# y = counts for control gene
DifferentialLRT <- function(x, y, xmin = 0) {
lrtX <- bimodLikData(x = x)
lrtY <- bimodLikData(x = y)
lrtZ <- bimodLikData(x = c(x, y))
lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
# Check to account for results that do not conform to expected model
if (is.infinite(lrt_diff) || (lrt_diff < 0) || is.nan(lrt_diff) || is.na(lrt_diff)){
lrtX <- bimodLikData_FixNoVar(x = x)
lrtY <- bimodLikData_FixNoVar(x = y)
lrtZ <- bimodLikData_FixNoVar(x = c(x, y))
lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
}
return(pchisq(q = lrt_diff, df = 3, lower.tail = F))
}
# Wrapper to all the fancy stuff
runCombinedLRT <- function(gene, test_matrix = NULL, control_matrix = NULL){
# Retrieve counts
test_counts <- as.numeric(unlist(test_matrix[gene, ]))
control_counts <- as.numeric(unlist(control_matrix[gene, ]))
# Perform Combined LRT
de.result <- DifferentialLRT(test_counts, control_counts)
return(de.result)
}
worker <- function(gene_list, test_matrix = NULL, control_matrix = NULL){
# Calculate p-values
p_vals <- lapply(gene_list, runCombinedLRT, test_matrix = test_matrix, control_matrix)
names(p_vals) <- gene_list
p_vals <- unlist(p_vals)
padj_vals <- stats::p.adjust(p_vals, method = "BH")
# Calculate mean expression
test_means <- Matrix::rowMeans(test_matrix[gene_list, ])
control_means <- Matrix::rowMeans(control_matrix[gene_list, ])
# FoldChange
foldChange <- test_means/control_means
log2FoldChange <- log2(foldChange)
output_df <- data.frame(gene_id = gene_list,
test_mean = test_means,
control_mean = control_means,
pval = p_vals, padj = padj_vals,
foldChange = foldChange,
log2FoldChange = log2FoldChange)
return(output_df)
}
# MC DAVID TEST
# Discrete/continuous model for single cell expression data based on a mixture
# a point mass at zero and log-normal distribution
# Use log normal counts
# LRT
# Load prepared Seurat object
seurat_obj <- readRDS("/Volumes/LACIE/CROP-seq/ARRAY_AGGR_OBJ.rds")
# Retrieve metadata from Seurat object
metadata <- FetchData(seurat_obj, vars = c("ARRAY", "TYPE", "gRNA", "TARGET"))
# Add your targets here
target <- c("NonTargeting-Human-0003-gene", "NonTargeting-Human-0004-gene")
# Check if we are testing a target - if so, exclude it from the control pool
if (all(startsWith(target, "NonTargeting"))){
control_guides <- grep("^NonTargeting", metadata$TARGET, value = TRUE)
control_guides <- unique(control_guides[which(!(control_guides%in% target))])
control_info <- subset(metadata, metadata$TARGET %in% control_guides)
} else{
control_info <- subset(metadata, metadata$TARGET == "NonTargeting")
}
# Subset out groups that we are going to test
test_info <- subset(metadata, metadata$TARGET %in% target)
# Use UMI-corrected count data and split into two matrices
normcounts <- seurat_obj[["SCT"]]@counts
test_counts <- normcounts[ , rownames(test_info)]
control_counts <- normcounts[, rownames(control_info)]
# Remove guide-associated genes from query list
gene_names <- rownames(normcounts)[which(!(rownames(normcounts) %in% grep("-gene$", rownames(normcounts), value = TRUE)))]
chunked_genes <- split(gene_names, seq(1,4))
test_results <- lapply(chunked_genes, worker, test_matrix = test_counts, control_matrix = control_counts)
de_results <- do.call("rbind", test_results)
de_results <- de_results[order(abs(de_results$log2FoldChange), decreasing = TRUE), ]
de_results <- de_results %>% filter(!(is.infinite(log2FoldChange)))
de_results <- de_results %>% mutate(target = paste(target, collapse = "_"))
de_results <- de_results %>% dplyr::select(gene_id, target, everything())
write_tsv(de_results, sprintf("%s_CROPseq_DE.tsv", paste(target, collapse = "_")))