-
Notifications
You must be signed in to change notification settings - Fork 2
/
linnarson_processing.R
66 lines (52 loc) · 2.58 KB
/
linnarson_processing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
library(homologene)
library(readr)
library(dplyr)
library(tidyr)
library(reshape2)
library(magrittr)
library(here)
getwd()
table <- read_tsv(here('./l5_all.agg.tab'), col_names = FALSE)
descriptions <- table[29, 9:273]
table %<>% filter(!is.na(X1) | X8 == 'ClusterName')
colnames(table) <- c(unlist(table[2,1:7]), unlist(table[1, 8:ncol(table)]))
table <- table[-c(1,2),]
# columns to drop
table <- table[!is.na(names(table))]
dropcols <- c('Accession', '_LogCV', '_LogMean', '_Selected', '_Total', '_Valid', 'ClusterName')
table %<>% select(-one_of(dropcols))
#to change all but the 'Gene' column name into numeric type
table[,2:length(colnames(table))] %<>% lapply(function(x) as.numeric(as.character(x)))
cell_type_info <- rbind(colnames(table[2:266]), descriptions)
cell_type_info <- t(cell_type_info)
colnames(cell_type_info) <- c('cluster_id', 'description')
cell_type_info <- as_tibble(cell_type_info)
write_csv(cell_type_info, path = "celltype_descriptions.csv")
#now melt
linnarsson <- as_tibble(reshape2::melt(table, id.vars='Gene', variable.name='cluster_id', value.name='expression'))
linnarsson %<>% mutate(log1Expression=log(1+expression))
# zscore across genes
linnarsson %<>%
group_by(Gene) %>%
mutate(log1ExpZ = (log1Expression - mean(log1Expression)) / sd(log1Expression))
print(dim(linnarsson))
linnarsson %<>% filter(!is.na(log1ExpZ))
print(dim(linnarsson))
linnarsson %<>% select(-expression, -log1Expression)
#some genes are duplicated - just average them
linnarsson %<>% group_by(Gene, cluster_id) %>% summarize(log1ExpZ = mean(log1ExpZ))
linnarsson %<>% group_by(cluster_id)
linnarsson %<>% mutate(log1ExpZRank = rank(log1ExpZ)) %>% select(-log1ExpZ)
#test <- spread(linnarsson, cluster_id, log1ExpZRank)
linnarssonMatrixMouse <- dcast(linnarsson, Gene ~ cluster_id, value.var = "log1ExpZRank")
rownames(linnarssonMatrixMouse) <- linnarssonMatrixMouse$Gene
linnarssonMatrixMouse$Gene <- NULL
#repeat code again after filtering for mouse genes that can be reached via human symbols (should be refactored)
unique_genes_all <- rownames(linnarssonMatrixMouse)
unique_genes_human_reachable <- mouse2human(unique_genes_all)$mouseGene
linnarsson %<>% filter(Gene %in% unique_genes_human_reachable)
linnarsson %<>% mutate(log1ExpZRank = rank(log1ExpZRank)) #rerank after filtering
linnarssonMatrixHumanReachable <- dcast(linnarsson, Gene ~ cluster_id, value.var = "log1ExpZRank")
rownames(linnarssonMatrixHumanReachable) <- linnarssonMatrixHumanReachable$Gene
linnarssonMatrixHumanReachable$Gene <- NULL
save(linnarssonMatrixMouse, linnarssonMatrixHumanReachable, file='processed_zeisel.Rdata')