-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpool-data.R
107 lines (86 loc) · 3.89 KB
/
pool-data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env Rscript
suppressPackageStartupMessages(library("optparse"))
suppressMessages(suppressWarnings(suppressPackageStartupMessages(library(tools, quietly=TRUE, verbose=FALSE, warn.conflicts=FALSE))))
source('~nikolas/bin/FCS/fcs.R')
# Simply pools all files together but in the future may decide to do normalisation before pooling.
option_list <- list(
make_option(c("-f","--in.file"),
help = 'List of files in a csv file. Files may be RData, csv or fcs.'),
make_option(c('-T','--total'), default=1E5,
help='Total number of events in resulting pooled file.'),
make_option(c('--channels'), default=NULL,
help='Channels on which to do the clustering. By default select all channels.'),
make_option(c('--plot.file'), default=NULL,
help='Plot pooled file.'),
make_option(c('-o','--out.file'),
help='Pooled output file.')
)
option.parser <- OptionParser(option_list=option_list)
opt <- parse_args(option.parser)
# do checks first
if (is.null(opt$in.file) || !file.exists(opt$in.file)) stop("No input file specified on command line or file does not exist.")
if (!is.null(opt$channels)) {
channels <- unlist(strsplit(opt$channels, ","))
} else {
channels <- NULL
}
# expect a list of fcs files in the csv file
fcs.names <- as.character(read.csv(opt$in.file,header=FALSE)[,1])
# will have to randomly sample N from each fcs file to obtain a pooled file with a total of T events
# round down
print(N <- floor(opt$total/length(fcs.names)))
cat('Pooling', N, 'lines, from each of the following', length(fcs.names), 'files on the channels', channels, '( total=', nrows <- N*length(fcs.names), ')', '\n')
fcs.filter <- function(fcs) {
#remove min & max scatter
min.ssc <- min(fcs[,'SSCA'])
max.ssc <- max(fcs[,'SSCA'])
fcs <- fcs[min.ssc < fcs[,'SSCA'] & fcs[,'SSCA'] < max.ssc,]
min.fsc <- min(fcs[,'FSCA'])
max.fsc <- max(fcs[,'FSCA'])
fcs <- fcs[min.fsc < fcs[,'FSCA'] & fcs[,'FSCA'] < max.fsc,]
fcs <- fcs[which(fcs[,'FSCA'] > 2),]
fcs <- fcs[which(fcs[,'SSCA'] > 2),]
return(fcs)
}
pooled.data <- do.call('rbind', lapply(fcs.names, function(fcs.name) {
cat('Reading', fcs.name, 'extension', file.ext <- tolower(file_ext(fcs.name)), '\n')
if (file.ext=='fcs') {
fcs.data <- getChannels(read.FCS(fcs.name, filter=TRUE, channels=channels, which.lines=N), channels=channels)
} else if (file.ext=='csv') {
fcs.data <- read.csv(fcs.name)[1:N,channels]
} else if (file.ext=='rdata') {
print(load(fcs.name))
#result
fcs.data <- data.frame(fcs.data)
#fcs.data <- fcs.filter(fcs.data)
fcs.data <- fcs.data[1:N,channels]
}
fcs.data[,'fcs.name'] <- basename(fcs.name)
print(head(fcs.data))
return(fcs.data)
}))
print(dim(pooled.data))
print(head(pooled.data))
print(tail(pooled.data))
fcs.data <- pooled.data
save(fcs.data, file=opt$out.file, compress='xz', compression_level=9)
#d <- list(fcs.names=fcs.names, data=pooled.data)
#save(d,file=opt$out.file)
png(opt$plot.file)
pairs.smoothScatter(pooled.data[,channels])
dev.off()
###
pool.data2 <- function(d, down.sample, channels)
#pool data
#d <- getChannels( do.call('rbind', lapply(d, function(x) getChannels(x[sample(1:nrow(x), down.sample),]))), channels )
#d <- lapply(d, function(x) getChannels(x[sample(1:nrow(x), down.sample),], channels))
#way too slow!
#dens <- lapply(d, compute.density, channels=channels)
getChannels( do.call('rbind',
lapply(d, function(x) {
#calculate pairwise density
table(f <- density.filter(x, channels, '25%'))
x <- getChannels(x[f,], channels)
return(x)
}) ), channels)
#pool.data <- function(d, down.sample, channels) return(getChannels( do.call('rbind', lapply(d, function(x) getChannels(x[sample(1:nrow(x), down.sample),]))), channels ))