-
Notifications
You must be signed in to change notification settings - Fork 4
/
unnest_sentences.R
86 lines (78 loc) · 3.11 KB
/
unnest_sentences.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#' Split a column of text into sentences
#'
#' @description Expands a data frame so that every sentence found in a text
#'   column gets its own row, together with a running sentence id.
#'   \code{unnest_sentences_} takes column names as strings;
#'   \code{unnest_sentences} takes them unquoted.
#' @details Each input row is repeated once per sentence returned by the
#'   sentence parser for that row's text; a row that yields no sentences
#'   contributes no rows to the result. When \code{doc_id} is supplied the
#'   result is grouped by document (in the order of the document id's factor
#'   levels, keeping the original row order within a document) and sentence ids
#'   are numbered consecutively within each document. Rows whose document id is
#'   \code{NA} are kept as one group placed last.
#' @param tbl dataframe containing column of text to be split into sentences
#' @param output name of column to be created to store parsed sentences
#' @param input name of input column of text to be parsed into sentences
#' @param doc_id column of document ids; if not provided it will be assumed that each row is a different document
#' @param output_id name of column to be created to store sentence ids; if more
#'   than one name is given only the first is used (with a warning)
#' @param drop whether original input column should get dropped; must be a
#'   single \code{TRUE} or \code{FALSE}
#' @return A data.frame of parsed sentences and sentence ids. The columns of
#'   \code{tbl} come first (minus \code{input} when \code{drop = TRUE}),
#'   followed by the \code{output_id} and \code{output} columns. A zero-row
#'   \code{tbl} gives a zero-row data.frame with the same columns.
#' @examples
#'
#' df <- data.frame(doc_id = 1:3,
#'                  text = c("Testing the system. Second sentence for you.",
#'                           "System testing the tidy documents df.",
#'                           "Documents will be parsed and lexranked."),
#'                  stringsAsFactors=FALSE)
#'
#' unnest_sentences(df, sents, text)
#' unnest_sentences_(df, "sents", "text")
#'
#' \dontrun{
#' library(magrittr)
#'
#' df %>%
#'   unnest_sentences(sents, text)
#' }
#' @export
unnest_sentences_ <- function(tbl, output, input, doc_id=NULL, output_id="sent_id", drop=TRUE) {
  # ---- argument validation -------------------------------------------------
  if (!is.data.frame(tbl)) stop("tbl must be a dataframe")
  if (!(input %in% names(tbl))) stop("input column not found in tbl")
  if (!is.character(tbl[[input]])) stop("input column must be character")
  if (length(output_id) > 1) {
    warning("only first element of output_id will be used")
    output_id <- output_id[1]
  }
  # `drop` is used in a scalar `if`, so reject NA and non-scalar values here
  # rather than failing later with an unrelated error message.
  if (!is.logical(drop) || length(drop) != 1L || is.na(drop)) {
    stop("drop must be logical")
  }
  if (!is.null(doc_id)) {
    if (!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
  }

  # ---- parse text and work out how often each input row is repeated --------
  parsed_sents <- sentence_parser(tbl[[input]])
  n_sents <- vapply(parsed_sents, length, integer(1))
  row_idx <- rep(seq_along(parsed_sents), times = n_sents)

  # Capture the document ids before the input column can be dropped, so that
  # grouping still works when `doc_id` and `input` name the same column.
  doc_values <- if (!is.null(doc_id)) tbl[[doc_id]][row_idx] else NULL

  if (drop) {
    tbl[[input]] <- NULL
  }

  # ---- build the sentence columns in one vectorised step -------------------
  sents <- unlist(parsed_sents, use.names = FALSE)
  if (is.null(sents)) sents <- character(0)  # nothing parsed at all
  sent_cols <- data.frame(sent_id = sequence(n_sents),
                          sents = sents,
                          stringsAsFactors = FALSE)
  names(sent_cols) <- c(output_id, output)

  out_tbl <- cbind(tbl[row_idx, , drop = FALSE], sent_cols)
  # Guard against any renaming of the original columns by cbind().
  names(out_tbl)[seq_along(tbl)] <- names(tbl)

  # ---- renumber sentences within each document ------------------------------
  if (!is.null(doc_id)) {
    # NA ids become their own (last) level so those rows are not lost.
    doc_grp <- addNA(as.factor(doc_values), ifany = TRUE)
    grp_codes <- as.integer(doc_grp)
    # order() leaves ties in their original order, so sentences keep their
    # relative order inside each document.
    ord <- order(grp_codes)
    out_tbl <- out_tbl[ord, , drop = FALSE]
    # After sorting each document is one contiguous run of equal codes.
    out_tbl[[output_id]] <- sequence(rle(grp_codes[ord])$lengths)
  }

  rownames(out_tbl) <- NULL
  out_tbl
}
#' @rdname unnest_sentences_
#' @export
unnest_sentences <- function(tbl, output, input, doc_id=NULL, output_id='sent_id', drop=TRUE) {
  # Non-standard-evaluation front end: the column arguments are captured
  # unevaluated and converted to strings, then handed to unnest_sentences_().

  # With the default (NULL), substitute() yields NULL and as.character() gives
  # a zero-length vector; map that back to NULL so no document grouping occurs.
  doc_col <- as.character(substitute(doc_id))
  if (length(doc_col) == 0) {
    doc_col <- NULL
  }

  sent_id_col <- as.character(substitute(output_id))
  text_col <- as.character(substitute(input))
  sent_col <- as.character(substitute(output))

  unnest_sentences_(
    tbl = tbl,
    output = sent_col,
    input = text_col,
    doc_id = doc_col,
    output_id = sent_id_col,
    drop = drop
  )
}