-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse_star_output
160 lines (141 loc) · 6.28 KB
/
parse_star_output
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env Rscript
#gratefully stolen from github
library(tidyverse)
#' Parse sample from file names
#'
#' Sample names are parsed from file names without the extension. If the file name is not unique,
#' the parent directory is used.
#'
#' @param files file name with path
#'
#' @return A vector
#'
#' @author Chris Stubben
#'
#' @examples
#' # File name or parent directory should be unique
#' extract_samples(c("align1/1355X1.counts", "align2/1355X2.counts"))
#' extract_samples(c("align1/1355X1/Log.out", "align2/1355X2/Log.out"))
#' @export
extract_samples <- function(files){
## capture sample in file name or path
x <- lapply( strsplit(files, "/"), rev)
samples <- sapply(x, "[", 1 )
# remove file extension
samples <- gsub("\\..*", "", samples)
if( any(duplicated(samples )) ){
# use parent directory, 13555X2/Log.final.out
samples <- sapply(x, "[", 2 )
}
if( any(duplicated(samples )) ){
stop("Sample names are not unique: \n ", paste(files, collapse="\n "), call.=FALSE)
}
samples
}
#' Read and combine common output files
#'
#' Read output files and parse the file name to add sample IDs in the first column
#'
#' @param path the path to output files, the default corresponds to the working directory.
#' @param pattern regular expression for file name matching
#' @param delim separator in output files, default table
#' @param \dots additional options such as col_names passed to \code{read_delim}.
#'
#' @note Sample names are parsed from file names without extensions. If the file name is not unique,
#' the parent directory is used. Requires tibble > version 1.2 to avoid error in add_column
#'
#' @return A list with coverage and stats data.frames
#'
#' @author Chris Stubben
#'
#' @examples
#' \dontrun{
#' #FeatureCounts summary (second column name with *.bam is always unique, so skip and assign)
#' fc <- read_sample_files(".summary$", skip=1, col_names=c("status", "count"))
#' filter(fc, count!=0) %>%
#' hchart("bar", x=sample, y=count, group=status) %>%
#' hc_plotOptions(bar = list(stacking = "normal"))
#' }
#' @export
read_sample_files <- function(path=".", pattern="\\.counts$", delim="\t", ...){
outfiles <- list.files(path, pattern, recursive=TRUE, full.names=TRUE)
outfiles = outfiles[!grepl("pass1",outfiles)]
if(length(outfiles) == 0) stop("No ", pattern, " files found in ", path, call.=FALSE)
samples <- extract_samples(outfiles)
out1 <- vector("list", length(outfiles))
for(i in seq_along(outfiles)){
message("Reading ", outfiles[i])
x <- suppressMessages( readr::read_delim(outfiles[i], delim=delim, ...) )
## requires tibble > 1.2 (to add single name into column)
out1[[i]] <- tibble::add_column (x, sample= samples[i], .before=1)
}
dplyr::bind_rows(out1)
}
#' Read STAR log files
#'
#' Read STAR Log.final.out files and optionally reshape into wide format.
#'
#' @param path the path to STAR log files, the default corresponds to the working directory.
#' @param pattern regular expression for file name matching, default .final.out
#' @param reshape reshape percent mapping into wide format with samples in rows
#'
#' @note Reading output files requires a unique sample identifier in either the file name or parent directory
#'
#' @return A tibble
#'
#' @author Chris Stubben
#'
#' @examples
#' \dontrun{
#' x <- read_STAR( pattern=".star.out$")
#' reads <- c("Uniquely mapped", "Mapped to multiple loci",
#' "Mapped to too many loci", "Unmapped reads")
#' y <- filter(x, stat %in% reads) %>% mutate( stat= factor(stat, levels=reads ))
#' hchart(y, "bar", x=sample , y=value, group=stat ) %>%
#' hc_plotOptions(bar = list(stacking = "normal")) %>%
#' hc_colors( c('#437bb1', '#7cb5ec', '#f7a35c', '#b1084c') ) %>%
#' hc_yAxis(reversedStacks = FALSE)
#' read_STAR( pattern=".star.out$", reshape=TRUE)
#' }
#' @export
read_STAR <- function( path=".", pattern, reshape=FALSE){
if(missing(pattern)) pattern <- "\\.final.out$"
# suppress warnings about subheaders.... Warning: 4 parsing failures.
x <- suppressWarnings( read_sample_files(path, pattern, col_names=c("stat", "value") , skip=5, trim_ws=TRUE ))
x <- dplyr::filter(x, !is.na(value)) %>% # drop subheader with NA value ... UNIQUE READS:
dplyr::mutate( stat = gsub(" \\|$", "", stat), # remove pipe from end of statistic
stat = gsub("Number of reads m", "M", stat), # shorten Number of reads mapped to...
stat = gsub(" reads number", "", stat), # shorten Uniquely mapped reads number
value = gsub("%", "", value), # drop % from value
value = as.numeric(value)) # change value to numeric
# ADD unmapped reads -
mapped <- c("Uniquely mapped", "Mapped to multiple loci", "Mapped to too many loci" )
# see http://stackoverflow.com/questions/40749742/add-missing-subtotals-to-each-group-using-dplyr
x <- dplyr::group_by(x, sample) %>%
dplyr::summarize( value = value[stat == 'Number of input reads'] - sum(value[stat %in% mapped ]),
stat = 'Unmapped reads' ) %>%
dplyr::bind_rows(x) %>%
dplyr::select(sample, stat, value) %>% # back to original column order
dplyr::arrange(sample)
## split unmapped reads into too many mismatches, too short and other?
# divide %unmapped too short by total %unmapped and muliply by Unmapped reads to get estimate
if(reshape){
n <- c(mapped, "Unmapped reads", "Number of input reads" )
x <- dplyr::filter(x, stat %in% n) %>%
dplyr::mutate( stat = factor(stat, levels= n)) %>%
tidyr::spread( stat, value) %>%
dplyr::mutate_each( dplyr::funs(as.integer ) , -1)
}
x
}
library("optparse")
option_list = list(
make_option(c("-d", "--STAR"), type="character", default=NULL,
help="a folder containing STAR.out logs", metavar="character"),
make_option(c("-o", "--out"), type="character", default="out.txt",
help="output file name", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);
output = janitor::clean_names(read_STAR(path = opt$STAR,reshape = TRUE))
data.table::fwrite(output, opt$out)