-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy path03-retractors.R
101 lines (82 loc) · 2.96 KB
/
03-retractors.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#' ---
#' title: "Retractors"
#' author: "Rebekah Brown / Bob Rudis"
#' date: ""
#' output:
#'   html_document:
#'     keep_md: true
#'     theme: simplex
#'     highlight: monochrome
#' ---
#+ init, include=FALSE
# Chunk defaults for the whole report: hide messages, warnings and source
# code, and render figures as 10 x 6 PNGs with fig.retina = 2.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE,
  echo = FALSE,
  dev = "png",
  fig.retina = 2,
  fig.width = 10,
  fig.height = 6
)
#+ libs
library(stringi)
library(ggalt)
library(knitr)
library(viridis)
library(tidytext) # devtools::install_github("juliasilge/tidytext")
library(hrbrthemes)
library(tidyverse)
#+ data, cache=TRUE
# Build the corpus: one row per text file found under source-docs/.
#   doc    - file name with up to two extensions removed
#            (e.g. "report.pdf.txt" -> "report")
#   text   - the file's lines joined with spaces, lower-cased, with
#            punctuation removed
#   doc_id - first 30 characters of `doc`, used as the document key downstream
#
# The pattern is anchored on a literal ".txt" suffix; the previous ".*txt$"
# also accepted any file whose name merely ended in "txt".
# NOTE(review): stringi uses ICU regexes, where [[:punct:]] may not cover
# symbol characters such as "$" or "+" - confirm this is acceptable.
corpus <- list.files("source-docs", pattern = "\\.txt$", full.names = TRUE) %>%
  map_df(~ tibble(
    doc = tools::file_path_sans_ext(tools::file_path_sans_ext(basename(.x))),
    text = read_lines(.x) %>% paste0(collapse = " ") %>% stri_trans_tolower()
  )) %>%
  mutate(
    text = stri_replace_all_regex(text, "[[:punct:]]", ""),
    doc_id = substr(doc, 1, 30)
  )
# Split every document into tokens (one row per token) and discard any token
# that contains a digit.
one_grams <- corpus %>%
  unnest_tokens(word, text) %>%
  filter(!stri_detect_regex(word, "[[:digit:]]"))

# Number of remaining tokens per document; used as the denominator when
# frequencies are normalized.
total_words <- one_grams %>%
  count(doc_id) %>%
  rename(total_words = n)
# Count how often each "retractor" keyword occurs in each document.
# lists/retractors.csv is read as plain lines, so every line is treated as one
# keyword (or phrase). Each keyword is spliced into a regular expression
# unescaped, so regex metacharacters in the list keep their special meaning.
word_list <- read_lines("lists/retractors.csv")

# Result: one row per (keyword, document) pair with
#   keyword - the keyword searched for
#   ct      - number of occurrences, NA when the keyword never occurs
#   doc_num - position of doc_id among the sorted ids, stored as character
#             (as a character, "10" sorts before "2" on a discrete axis)
retractor_df <- map_df(word_list, ~ {
  # The lookarounds require that the keyword is not glued to other word
  # characters, but they do not consume its neighbours. The earlier
  # "\\W%s\\W" form needed a real character on both sides, so it missed a
  # keyword at the very start or end of a document and undercounted
  # back-to-back repeats that share a single separating space.
  keyword_rx <- sprintf("(?<!\\w)%s(?!\\w)", .x)
  corpus %>%
    group_by(doc_id) %>%
    summarise(keyword = .x, ct = stri_count_regex(text, keyword_rx))
}) %>%
  mutate(
    doc_num = as.character(as.numeric(factor(doc_id))),
    # Zero counts become NA so the heatmaps can draw them with na.value.
    ct = ifelse(ct == 0, NA, ct)
  )
#' ## ID to Doc Name mapping:
# Lookup table: one row per document with its id, its number and its total
# word count.
retractor_df %>%
  distinct(doc_id, doc_num) %>%
  left_join(total_words) %>%
  kable()
#' ## Overall retractor word frequency per document
#+ overall_retractor_freq_summary
# Total keyword hits per document divided by that document's word count,
# drawn as one thick vertical segment per document rising from zero.
retractor_df %>%
  count(doc_id, wt = ct) %>%
  left_join(total_words) %>%
  mutate(
    pct = n / total_words,
    doc_num = as.character(as.numeric(factor(doc_id)))
  ) %>%
  ggplot(mapping = aes(x = doc_num, y = pct)) +
  geom_segment(
    mapping = aes(xend = doc_num, yend = 0),
    color = "lightslategray",
    size = 5
  ) +
  scale_y_percent() +
  labs(
    title = "Percent of 'retractor' words of total words in document corpus",
    x = "Document #",
    y = NULL
  ) +
  theme_ipsum_rc(grid = "Y")
#' ## General frequency per document
#+ retractor_frequency, fig.height=8
# Heatmap of raw keyword counts: documents on the x axis, keywords on the y
# axis. Cells whose count is NA are filled white.
retractor_df %>%
  ggplot(mapping = aes(x = doc_num, y = keyword, fill = ct)) +
  geom_tile(size = 0.125, color = "#2b2b2b") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  viridis::scale_fill_viridis(na.value = "white", direction = -1) +
  labs(
    title = '"Retractor" Frequency',
    x = NULL,
    y = NULL
  ) +
  theme_ipsum_rc(grid = "")
#' ## Normalized frequency per document
#+ retractor_frequency_normalized, fig.height=8
# Same heatmap layout as the raw-count version, but each cell is the keyword
# count divided by the document's total word count.
retractor_df %>%
  left_join(total_words) %>%
  mutate(pct = ct / total_words) %>%
  ggplot(mapping = aes(x = doc_num, y = keyword, fill = pct)) +
  geom_tile(size = 0.125, color = "#2b2b2b") +
  scale_x_discrete(expand = c(0, 0)) +
  scale_y_discrete(expand = c(0, 0)) +
  viridis::scale_fill_viridis(na.value = "white", direction = -1) +
  labs(
    title = '"Retractor" Frequency (normalized)',
    x = NULL,
    y = NULL
  ) +
  theme_ipsum_rc(grid = "")