makeData.R
library(sentimentr)
library(data.table)
#function for converting numbers to words
source(url("https://gist.githubusercontent.com/psychemedia/150cb9901529da58124a/raw/a12bfd600af11065255f39787c965b7a4be086d0/numbers2words.R"))
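#quick sanity check (a sketch: assumes the gist above defines numbers2words()
#and that it returns the number written out as a character string)
stopifnot(is.character(numbers2words(16)))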
trumptweets_text <- as.data.frame(fread("./data/trumptweets_16-09-19_text.txt", sep = NULL))
trumptweets_id <- read.csv("./data/trumptweets_16-09-19_id.txt", stringsAsFactors = FALSE, sep = "ø",
                           header = TRUE, colClasses = "character")
trumptweets <- cbind(trumptweets_text, trumptweets_id)
#drop date
trumptweets <- trumptweets[, c("text", "id_str")]
#the last line is not read in correctly, so we just drop it:
trumptweets <- trumptweets[-nrow(trumptweets), ]
#trumptweets$ID <- 1:nrow(trumptweets)
#change \n to whitespace
trumptweets$text <- gsub("\n", " ", trumptweets$text)
#remove entries that are empty or just a space
trumptweets <- trumptweets[!(trumptweets$text %in% c("", " ")),]
#remove @ and #
trumptweets$text <- gsub("@", "", trumptweets$text)
trumptweets$text <- gsub("#", "", trumptweets$text)
#replace the HTML entity &amp; with a literal &
trumptweets$text <- gsub("&amp;", "&", trumptweets$text)
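#e.g. (hypothetical text): the Twitter API delivers "Law &amp; Order" for a
#tweet that read "Law & Order"; the gsub above restores the literal ampersand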
#split into sentences
#library(tokenizers)
#trump <- data.frame(sentence = unlist(tokenize_sentences(trumptweets$text)),
# stringsAsFactors = FALSE)
allsentences <- get_sentences(trumptweets$text)
#repeat each tweet id once per sentence so sentence and id stay aligned
trump <- data.frame(sentence = unlist(allsentences),
                    stringsAsFactors = FALSE,
                    id = rep(trumptweets$id_str, times = lengths(allsentences)))
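#illustration (hypothetical values): a tweet with id "123" that splits into
#two sentences contributes its id twice, keeping the two columns aligned:
#  get_sentences("Great rally. Thank you!")  #-> "Great rally."  "Thank you!"
#  rep("123", times = 2)                     #-> "123" "123"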
#remove links (the original ".{+}" pattern is not a valid quantifier;
#"[^[:blank:]]+" matches the non-whitespace run that makes up the link)
#at the end of a sentence
trump$sentence <- gsub("https://[^[:blank:]]+$", "", trump$sentence)
#elsewhere in the sentence (link followed by whitespace)
trump$sentence <- gsub("https://[^[:blank:]]+[[:blank:]]", "", trump$sentence)
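#e.g. (hypothetical sentence): "Join me tonight! https://t.co/abc123" becomes
#"Join me tonight! " after the first gsub above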
#count the number of syllables in each sentence
#done in a loop so extra handling can be applied where nsyllable() is likely
#to miscount (numbers, abbreviations), and so problems can be fixed manually
#(note: in recent quanteda releases nsyllable() lives in the separate 'nsyllable' package)
library(quanteda)
trump$syllables <- NA
for (i in 1:nrow(trump)) {
  thisSentence <- trump$sentence[i]
  thisCount <- nsyllable(thisSentence)
  theseWords <- NULL
  #deal with numbers: nsyllable() returns NA for numerals, so spell them out
  #with numbers2words() and count the syllables of the resulting words instead
  if (grepl("[[:digit:]]", thisSentence)) {
    #split into words
    theseWords <- tokens(thisSentence)$text1
    numWords <- as.numeric(theseWords[grepl("[[:digit:]]", theseWords)])
    numCounts <- sum(nsyllable(numbers2words(numWords)))
    thisCount <- thisCount + numCounts
  }
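  #e.g. (hypothetical): in "We won 45 states", nsyllable() yields NA for "45";
  #numbers2words(45) spells the number out so its syllables can be counted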
  #nsyllable() misses a syllable for the abbreviation "Mr.", so add one for
  #every occurrence (fixed = TRUE so the "." is matched literally, not as a wildcard)
  if (grepl("Mr.", thisSentence, fixed = TRUE)) {
    if (is.null(theseWords)) {
      theseWords <- tokens(thisSentence)$text1
    }
    mrWords <- sum(theseWords == "Mr.")
    thisCount <- thisCount + mrWords
  }
  trump$syllables[i] <- thisCount
}
#trump$syllables <- nsyllable(trump$sentence)
#drop sentences that are empty:
trump <- trump[!is.na(trump$sentence),]
trump <- trump[!(trump$sentence %in% c("", " ")),]
#drop sentences with syllables > 7, keeping NA counts for the manual fix below
trump <- trump[is.na(trump$syllables) | trump$syllables <= 7,]
save(list = "trump", file = "trump_wduplicates.rda")
#remove duplicates
trump <- trump[!duplicated(trump$sentence), ]
save(list = "trump", file = "trump_beforemanualsylcount.rda")
load("trump_aftermanualsylcount.rda")
#drop sentences with syllables not in {5,7}
trump <- trump[trump$syllables %in% c(5,7),]
#add ID number
#trump$ID <- as.character(1:nrow(trump))
#add sentiment score
sentiments <- as.data.frame(sentiment(trump$sentence))
sentiments <- sentiments[!is.na(sentiments$word_count),]
#note: this assumes exactly one row per input sentence remains after the
#NA filter, so the scores align row-by-row with trump
trump$sentimentscore <- sentiments$sentiment
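#for reference, sentiment() returns one row per scored sentence with columns
#element_id, sentence_id, word_count and sentiment; e.g. (hypothetical input)
#sentiment("Fantastic win!") gives a single row with a positive score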
#save data
save(list = "trump", file = "./data/trumpData.rda")