3_text_genderAnalysis.Rmd

---
title: "Text analysis: title and abstract of male and female speakers"
author: "Melina Leite & Júlia Barreto"
date: "`r format(Sys.time(), '%d de %B de %Y')`"
output:
  rmdformats::readthedown:
    highlight: kate
    self_contained: true
    thumbnails: false
    lightbox: true
    gallery: false
  pdf_document:
    highlight: tango
    toc: yes
---
```{r setup, echo=FALSE, warning=FALSE, message=FALSE}
library(knitr)
library(tidyverse); library(cowplot); library(patchwork);library(ggbeeswarm)
theme_set(theme_cowplot())
library(janitor)
library(tidytext)
library(scales); library(ggrepel)
library(lubridate)
library(tidytext)
library(topicmodels)
library(tm); 
library(quanteda); library(quanteda.textplots)

# Global knitr chunk defaults: centred figures, code echoed, no caching,
# warnings and messages hidden; error = FALSE makes knitting stop on errors.
opts_chunk$set(
  fig.align = "center",
  warning = FALSE,
  message = FALSE,
  error = FALSE,
  echo = TRUE,
  cache = FALSE
)
# Session-level options used while knitting
options(
  formatR.arrow = TRUE,
  width = 90,
  help_type = "html"
)
```

# Data

Data description and summary statistics in script `0_data_summary`.

```{r}
# Restore the objects stored in the .Rdata file; the next line expects this
# to provide `data_t`.
load(file = "data_clean/data_title.Rdata")
# Working copy used by the rest of the script
data <- data_t
```

Using data from titles and abstracts.

Formatting titles  
```{r}
# Keep the id, the speaker/audience columns and the English title
tit <- dplyr::select(
  data,
  id, gender, position_cat, audience_n, title_english
)
# One row per title word (default unnest_tokens tokenisation)
text_tit <- unnest_tokens(tit, output = word, input = title_english)
```

Total of `r dim(tit)[1]` titles.

Number of titles per group
```{r}
# Number of titles per gender
kable(table(tit$gender))
# Number of titles per position category (rows) and gender (columns)
kable(table(tit$position_cat, tit$gender))
```


Formatting abstracts
```{r}
# Keep only the rows that actually have an English abstract
data_abs <- data %>% filter(!is.na(abstract_english))
# Tokenise the abstracts of that same subset (one row per word), so the token
# table is built from exactly the rows counted in `data_abs` rather than from
# the unfiltered `data`, which also contains rows with a missing abstract.
text_tok <- data_abs %>%
  dplyr::select(id, gender, position_cat, audience_n, abstract_english) %>%
  unnest_tokens(output = word, input = abstract_english)
```

Total of `r dim(data_abs)[1]` abstracts.

Number of abstracts per group
```{r}
table(data_abs$gender) %>% kable()
table(data_abs$position_cat,data_abs$gender)%>% kable()
```


Tidytext
```{r}
# Stack title tokens and abstract tokens into one long table
texts <- bind_rows(text_tit, text_tok)

# English stopwords as a one-column tibble so they can be anti-joined
stop_w <- tibble(word = stopwords("en"))

# Drop the stopwords, then sort alphabetically by word
text <- arrange(anti_join(texts, stop_w, by = "word"), word)

# Remove other non-words (numbers, characters) and leftover stopwords.
# NOTE(review): the slice drops the first 281 rows of the sorted table by
# position (numbers and some symbols, per the authors); the count is tied to
# the current data and sort order -- confirm it whenever the input changes.
text <- text %>%
  dplyr::slice(-(1:281)) %>%
  filter(
    nchar(word) != 1, # single letters
    !word %in% c("mpas", "δ13c", "β", "can", "aff", "agb", "al",
                 "and", "são")
  )

# Solve some simple plurals: every word listed here is reduced to its singular
# by dropping the final "s".
# "approaches" is deliberately NOT listed: dropping only the "s" would turn it
# into "approache"; it is mapped to "approach" by the lemma table instead.
plural <- c("actions", "advances", "adaptations", "amphibians", "animals",
            "ants", "anurans",
            "applications", "bees", "builds", "birds",
            "cerrados", "challenges", "biologists", "captures",
            "continents", "crops",
            "decisions", "declines", "determines", "determinants", "defenses",
            "dynamics", "dunnocks",
            "economics", "ecosystems", "environments", "experiences",
            "forests", "fruits", "fathers",
            "genetics", "gifts", "gradients", "guides", "impacts", "islands",
            "increases", "interactions", "jobs", "lives", "insects",
            "landscapes", "males", "mammals", "mangroves", "models",
            "movements",
            "mutualisms", "networks", "neotropics",
            "opilions", "phenotypes", "plants", "projects", "paths",
            "perspectives",
            "purposes", "populations", "promotes", "relationships",
            "relations",
            "resources", "responses", "roads", "services", "skulls", "snakes",
            "seeds",
            "soils", "spaces", "spiders", "stages", "trees", "variations",
            "threats")

# Rows holding one of the listed plurals; computed once and reused
is_plural <- text$word %in% plural
text$word[is_plural] <- substr(
  text$word[is_plural],
  1, nchar(text$word[is_plural]) - 1
)
```


Grouping similar words:

```{r}
# Lookup table of word variants: each row is (variant, replacement).
# Built row by row, which yields the same two-column character matrix as
# binding the pairs one at a time.
lemma <- matrix(
  c(
    "adaptive", "adaptation",
    "advancement", "advance",
    "agricultural", "agriculture",
    "agro", "agriculture",
    "amazonia", "amazon",
    "amazonian", "amazon",
    "andean", "andes",
    "apply", "application",
    "applying", "application",
    "approaches", "approach",
    "apidae", "apis",
    "arachnida", "arachnid",
    "argue", "argument",
    "basal", "basis",
    "behavioral", "behavior",
    "behavioural", "behavior",
    "bignonieae", "bignoniaceae",
    "biological", "biology",
    "brazilian", "brazil",
    "building", "build",
    "butterflies", "butterfly",
    "changing", "change",
    "cnidarian", "cnidaria",
    "coastal", "coast",
    "colour", "color",
    "colors", "color",
    "communities", "community",
    "competitive", "competition",
    "complexity", "complex",
    "convergences", "convergence",
    "convergent", "convergence",
    "croplands", "crop",
    "cultural", "culture",
    "darwin's", "darwin",
    "darwinian", "darwin",
    "defensive", "defense",
    "dependent", "dependence",
    "detecting", "detection",
    "determine", "determinant",
    "developmental", "development",
    "dispersers", "dispersal",
    "disturbed", "disturbance",
    "diversification", "diversity",
    "dragonflies", "dragonfly",
    "drier", "drought",
    "ecological", "ecology",
    "ecologists", "ecology",
    "endemic", "endemism",
    "effectiveness", "efficiency",
    "environmental", "environment",
    "evolutionary", "evolution",
    "expanding", "expansion",
    "extinct", "extinction",
    "facilitate", "facilitation",
    "fisheries", "fishery",
    "floral", "flora",
    "floristic", "flora",
    "forested", "forest",
    "functional", "function",
    "functionally", "function",
    "functioning", "function",
    "geographical", "geographic",
    "heterogeneties", "heterogeneity",
    "heterogeneous", "heterogeneity",
    "histories", "history",
    "integrated", "integration",
    "intregating", "integration",
    "integrative", "integration",
    "invasive", "invasion",
    "isotopic", "isotope",
    "linking", "link",
    "living", "live",
    "mammalia", "mammal",
    "managed", "manage",
    "managers", "manage",
    "mathematical", "mathematics",
    "mates", "mating",
    "mediated", "mediate",
    "mechanistic", "mechanism",
    "matrices", "matrix",
    "migratory", "migration",
    "mimicking", "mimicry",
    "modeling", "model",
    "mutualistic", "mutualism",
    "natural", "nature",
    "neotropical", "neotropic",
    "northeastern", "northeast",
    "occuring", "occur",
    "onça", "onca",
    "opiliones", "opilion",
    "parasite", "parasitism",
    "parent", "parenting",
    "phylogenies", "phylogeny",
    "phylogenetic", "phylogeny",
    "phylogenomic", "phylogeny",
    "pollinators", "pollination",
    "protected", "protect",
    "protective", "protect",
    "rainfall", "rain",
    "reconstructing", "reconstruction",
    "regulatory", "regulation",
    "regulates", "regulation",
    "relation", "relationship",
    "reproductive", "reproduction",
    "restored", "restoration",
    "robustness", "robust",
    "scientific", "science",
    "scientist", "science",
    "sexy", "sexual",
    "simulated", "simulation",
    "societies", "society",
    "social", "society",
    "socio", "society",
    "space", "spatial",
    "spacio", "spatial",
    "stabilize", "stability",
    "stable", "stability",
    "stories", "story",
    "strategic", "strategy",
    "strategies", "strategy",
    "structured", "structure",
    "structuring", "structure",
    "studies", "study",
    "studing", "study",
    "sustainable", "sustainability",
    "theories", "theory",
    "theoretical", "theory",
    "threatened", "threat",
    "tropical", "tropic",
    "vision", "visual"
  ),
  ncol = 2, byrow = TRUE
)
lemma <- as.data.frame(lemma)

# Apply the replacements one row at a time, in table order
for (row in seq_len(nrow(lemma))) {
  variant <- lemma[row, 1]
  text$word[text$word == variant] <- lemma[row, 2]
}
```


# WORDS - all data

Number of words per gender and academic level

```{r}
# Number of word tokens per gender
kable(table(text$gender))
# Number of word tokens per position category (rows) and gender (columns)
kable(table(text$position_cat, text$gender))
```


20 most common words
```{r}
# Word counts in decreasing order, keeping the 20 largest (ties included)
kable(
  top_n(count(text, word, sort = TRUE), 20, n)
)
```

## Word cloud

All words
```{r}
# Word cloud built from a document-feature matrix of all cleaned words
text$word %>%
  tokens() %>%
  dfm() %>%
  textplot_wordcloud()
```

By gender. Purple female, yellow male
```{r}
# Two-panel layout: one word cloud per gender
par(mfrow = c(1, 2))
# Female speakers (purple)
text$word[text$gender == "F"] %>%
  tokens() %>%
  dfm() %>%
  textplot_wordcloud(col = "#6D57CF")
# Ask the next high-level plot not to clear the device first
par(new = TRUE)
# Male speakers (yellow)
text$word[text$gender == "M"] %>%
  tokens() %>%
  dfm() %>%
  textplot_wordcloud(col = "#FCA532")
```


## Word frequencies by gender

```{r}
# Share of each word within each gender's vocabulary, reshaped to one row per
# word with proportion_F / proportion_M and n_F / n_M columns, then sorted by
# the absolute difference between the two proportions (largest first).
props <- text %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(
    names_from = gender,
    values_from = c(proportion, n)
  ) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M)) %>%
  mutate(
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# `label` holds the word for the first 20 rows only and NA everywhere else
props$label <- NA
props$label[1:20] <- props$word[1:20]
```

```{r, echo=F}
# Each word's share of the male (x) against the female (y) vocabulary on
# log10 axes, coloured by the absolute difference; dashed line from
# geom_abline() as the reference.
ggplot(
  props,
  aes(x = proportion_M, y = proportion_F, color = abs.dif.p)
) +
  geom_abline(color = "gray40", lty = 2) +
  geom_point(size = 1, alpha = 0.1) +
  # every word is a label candidate; check_overlap drops colliding labels
  geom_text(
    aes(label = word),
    check_overlap = TRUE, vjust = 1, hjust = 0.32
  ) +
  # alternatives left disabled by the authors:
  #   geom_jitter(size=2.5, alpha=0.02)
  #   geom_text_repel(aes(label=label), size=3.2)
  #   xlim(0,1)+ylim(0,0.2)
  #   geom_smooth(method="lm")
  scale_x_log10(
    name = "Male most used words",
    limits = c(7.446570e-05, 0.016),
    labels = percent_format()
  ) +
  scale_y_log10(
    name = "Female most used words",
    labels = percent_format()
  ) +
  scale_color_gradient(
    name = "Absolute \n difference",
    low = "blue", high = "red",
    labels = percent_format()
  ) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))
# Write the figure to disk (ggsave uses the last plot by default)
ggsave("figures/FIG_4_wordFreq.jpg", height = 7, width = 9)
```


Words that are close to the dashed line have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females. Differences above 0.3% are also indicated in text.

Correlation of word-use frequency between genders:
```{r}
# Correlation test (cor.test defaults) between the female and male word
# proportions; the call is kept as is because cor.test prints the argument
# expressions in its output.
cor.test(props$proportion_F, props$proportion_M)
```

Highly correlated: both genders tend to use the main words at similar frequencies.

20 words with the largest differences in frequency

```{r}
# Long table for a mirrored bar chart of the labelled words: female
# proportions are negated so they extend to the left of zero, and words are
# ordered by their total count across both genders.
prop2 <- props %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(
    ntot = n_F + n_M,
    word = fct_reorder(word, ntot, max),
    proportion_F = -proportion_F
  ) %>%
  # columns 2 and 3 of the table, selected by name
  pivot_longer(
    c(proportion_F, proportion_M),
    names_to = "gender", values_to = "proportion"
  )

# Shared tick positions; labels show magnitudes on both sides of zero
prop_breaks <- c(-0.02, -0.01, 0, 0.01, 0.02)
ggplot(prop2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(
    name = "gender",
    values = c("#6D57CF", "#FCA532"),
    labels = c("F", "M")
  ) +
  geom_vline(
    xintercept = prop_breaks,
    linetype = "dotted", col = "darkgray"
  ) +
  scale_x_continuous(
    breaks = prop_breaks,
    labels = c(0.02, 0.01, 0, 0.01, 0.02)
  )
```

## TF IDF

> The statistic tf-idf is intended to measure how important a word is to a document in a collection (or corpus) of documents, for example, to one novel in a collection of novels or to one website in a collection of websites.

>  Calculating tf-idf attempts to find the words that are important (i.e., common) in a text, but not too common. Let’s do that now.

```{r}
# tf-idf with each gender treated as one document: count words per gender,
# attach tf / idf / tf_idf, and sort by decreasing tf_idf
text_id <- count(text, gender, word) %>%
  bind_tf_idf(term = word, document = gender, n = n) %>%
  arrange(desc(tf_idf))
```

10 "exclusive" words for each group
```{r}
# Store the words as a factor before plotting
text_id$word <- as.factor(text_id$word)
# Ten highest tf-idf words within each gender (top_n keeps ties), one facet
# per gender with free scales
text_id %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is mapped to x and the words to y, so the axis title belongs on x
  # (it was previously placed on the word axis, leaving the tf-idf axis blank)
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
```


# WORDS - professors only data

```{r}
# Restrict the cleaned tokens to rows whose position category is "professor"
textP <- filter(text, position_cat == "professor")

# Number of word tokens per gender in that subset
table(textP$gender)
```

```{r}
# Two-panel layout: one professors-only word cloud per gender
par(mfrow = c(1, 2))
# Female professors (purple)
textP$word[textP$gender == "F"] %>%
  tokens() %>%
  dfm() %>%
  textplot_wordcloud(col = "#6D57CF")
# Ask the next high-level plot not to clear the device first
par(new = TRUE)
# Male professors (yellow)
textP$word[textP$gender == "M"] %>%
  tokens() %>%
  dfm() %>%
  textplot_wordcloud(col = "#FCA532")
```


Mean number of words by abstract
```{r}
# Distribution of the number of word tokens per id, split by gender:
# violin outline, narrow boxplot, and the individual points on top
count(textP, id, gender) %>%
  ggplot(aes(x = gender, y = n)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  ggbeeswarm::geom_quasirandom(size = 3, shape = 21)
```

20 most common words
```{r}
# Professors-only word counts in decreasing order, keeping the 20 largest
# (ties included)
kable(
  top_n(count(textP, word, sort = TRUE), 20, n)
)
```

## Words Frequency by gender

```{r}
# Professors only: share of each word within each gender's vocabulary,
# reshaped to one row per word with proportion_F / proportion_M and n_F / n_M
# columns, sorted by the absolute difference in proportion (largest first).
propsP <- textP %>%
  count(gender, word) %>%
  group_by(gender) %>%
  mutate(proportion = n / sum(n)) %>%
  pivot_wider(
    names_from = gender,
    values_from = c(proportion, n)
  ) %>%
  mutate(abs.dif.p = abs(proportion_F - proportion_M)) %>%
  mutate(
    rel.dif.p = pmax(proportion_F, proportion_M) /
      pmin(proportion_F, proportion_M)
  ) %>%
  arrange(desc(abs.dif.p))

# `label` holds the word for the first 20 rows only and NA everywhere else
propsP$label <- NA
propsP$label[1:20] <- propsP$word[1:20]
```

```{r}
# Professors only: each word's share of the male (x) against the female (y)
# vocabulary on log10 axes with identical limits, coloured by the absolute
# difference; dashed line from geom_abline() as the reference.
ggplot(
  propsP,
  aes(x = proportion_M, y = proportion_F, color = abs.dif.p)
) +
  geom_abline(color = "gray40", lty = 2) +
  geom_point(size = 1, alpha = 0.1) +
  # every word is a label candidate; check_overlap drops colliding labels
  geom_text(
    aes(label = word),
    check_overlap = TRUE, vjust = 1, hjust = 0.32
  ) +
  # alternatives left disabled by the authors:
  #   geom_jitter(size=2.5, alpha=0.02)
  #   geom_text_repel(aes(label=label), size=3.2)
  #   xlim(0,1)+ylim(0,0.2)
  #   geom_smooth(method="lm")
  scale_x_log10(
    name = "Male most used words",
    limits = c(0.0003, 0.02),
    labels = percent_format()
  ) +
  scale_y_log10(
    name = "Female most used words",
    limits = c(0.0003, 0.02),
    labels = percent_format()
  ) +
  scale_color_gradient(
    name = "Absolute \n difference",
    low = "blue", high = "red",
    labels = percent_format()
  ) +
  theme(legend.justification = c(1, -0.1), legend.position = c(1, 0))
# Write the figure to disk (ggsave uses the last plot by default)
ggsave("figures/FIG_S5_wordFreq_Prof.jpg", height = 7, width = 9)
```


Words that are close to the dashed line in these plots have similar frequencies in both genders. Words that are far from the line are words that are found more in one set of texts than another.

Legend: absolute differences in the frequency of the word by males and females.

Labels for the 20 words with largest differences in frequency.

Correlation of word-use frequency between genders:
```{r}
# Correlation test (cor.test defaults) between the female and male word
# proportions in the professors-only table; the call is kept as is because
# cor.test prints the argument expressions in its output.
cor.test(propsP$proportion_F, propsP$proportion_M)
```


20 words with the largest differences in frequency

```{r}
# Professors only: long table for a mirrored bar chart of the labelled words.
# Female proportions are negated so they extend to the left of zero, and
# words are ordered by their total count across both genders.
propP2 <- propsP %>%
  filter(!is.na(label)) %>%
  arrange(desc(proportion_F), desc(proportion_M)) %>%
  mutate(
    ntot = n_F + n_M,
    word = fct_reorder(word, ntot, max),
    proportion_F = -proportion_F
  ) %>%
  # columns 2 and 3 of the table, selected by name
  pivot_longer(
    c(proportion_F, proportion_M),
    names_to = "gender", values_to = "proportion"
  )

# Shared tick positions; labels show magnitudes on both sides of zero
prop_breaksP <- c(-0.02, -0.01, 0, 0.01, 0.02)
ggplot(propP2, aes(x = proportion, y = word, fill = gender)) +
  geom_col() +
  ylab("") +
  xlab("Proportion") +
  scale_fill_manual(
    name = "gender",
    values = c("#6D57CF", "#FCA532"),
    labels = c("F", "M")
  ) +
  geom_vline(
    xintercept = prop_breaksP,
    linetype = "dotted", col = "darkgray"
  ) +
  scale_x_continuous(
    breaks = prop_breaksP,
    labels = c(0.02, 0.01, 0, 0.01, 0.02)
  )
```


## TF IDF

```{r}
# Professors only: tf-idf with each gender treated as one document, sorted by
# decreasing tf_idf
text_idP <- count(textP, gender, word) %>%
  bind_tf_idf(term = word, document = gender, n = n) %>%
  arrange(desc(tf_idf))
```

10 "exclusive" words for each group
```{r}
# Store the words as a factor before plotting
text_idP$word <- as.factor(text_idP$word)
# Professors only: ten highest tf-idf words within each gender (top_n keeps
# ties), one facet per gender with free scales
text_idP %>%
  group_by(gender) %>%
  arrange(desc(tf_idf)) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  ggplot(aes(x = tf_idf, y = reorder(word, tf_idf), fill = gender)) +
  geom_col(show.legend = FALSE) +
  # tf-idf is mapped to x and the words to y, so the axis title belongs on x
  # (it was previously placed on the word axis, leaving the tf-idf axis blank)
  labs(x = "tf-idf", y = NULL) +
  facet_wrap(~gender, scales = "free") +
  theme_minimal()
```

# Topic model - all data

LDA - latent Dirichlet allocation, a method for fitting topic models

> It treats each document as a mixture of topics, and each topic as a mixture of words. This allows documents to “overlap” each other in terms of content, rather than being separated into discrete groups, in a way that mirrors typical use of natural language.

> Every document is a mixture of topics

> Every topic is a mixture of words


```{r}
# Document-term matrix for LDA: one document per "<id>_<gender>" key, one
# term per word, cell value = number of occurrences
matext <- text %>%
  count(id, gender, word) %>%
  transmute(id = paste(id, gender, sep = "_"), word, n) %>%
  cast_dtm(document = id, term = word, value = n)
```

Choosing number of topics: comparing AIC

```{r}
# Fit LDA models with 2, 3, 4, 5, 10 and 20 topics, all with the same seed
lda_control <- list(seed = 1234)
ap_lda2  <- LDA(matext, k = 2,  control = lda_control)
ap_lda3  <- LDA(matext, k = 3,  control = lda_control)
ap_lda4  <- LDA(matext, k = 4,  control = lda_control)
ap_lda5  <- LDA(matext, k = 5,  control = lda_control)
ap_lda10 <- LDA(matext, k = 10, control = lda_control)
ap_lda20 <- LDA(matext, k = 20, control = lda_control)
# AIC comparison table for the six fits
bbmle::AICtab(
  ap_lda2, ap_lda3, ap_lda4, ap_lda5, ap_lda10, ap_lda20,
  base = TRUE
)
```

The two-topic model seems the most plausible one.

## Word-topic probabilities

10 words with the largest probabilities for each group
```{r}
# Per-topic word probabilities (beta) from the two-topic model
ap_topics <- tidy(ap_lda2, matrix = "beta")
# Ten highest-beta terms of each topic (ties kept), sorted within topic
ap_top_terms <- ap_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
# Horizontal bars of beta per term, one facet per topic
ap_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip()
```
Words with the greatest difference in beta between topics

```{r}
# One row per term with its beta in topic 1 and topic 2 side by side; keep
# terms whose beta exceeds 0.001 in at least one topic and compute the log2
# ratio of topic 2 over topic 1.
beta_spread <- ap_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread
# Plot the five smallest and five largest log ratios. The rows are picked
# relative to the table length rather than by fixed positions (previously
# 1:5 and 260:264), so the extremes are selected whatever the number of terms
# that pass the filter; unique() avoids duplicates for tables under 10 rows.
beta_spread %>%
  arrange(log_ratio) %>%
  slice(unique(c(head(seq_len(n()), 5), tail(seq_len(n()), 5)))) %>%
  ggplot(aes(fct_reorder(term, log_ratio, min), log_ratio)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log2 ration of beta in topic 2/topic 1") +
  xlab("Word")
```


## Document-topic probabilities - classifying the abstracts

and comparing the two groups by gender (if there is a difference in frequency)

```{r}
# Per-document topic probabilities (gamma) from the two-topic model
ap_documents <- tidy(ap_lda2, matrix = "gamma")
# The gender code is the last character of the document key; each document is
# then assigned the topic with the highest gamma (top_n keeps ties)
classifi <- ap_documents %>%
  mutate(gender = str_sub(document, -1)) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

# Raw cross-tabulation of gender by assigned topic
table(classifi$gender, classifi$topic)

# Same table as row percentages with the counts appended
classifi %>%
  tabyl(gender, topic) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 0) %>%
  adorn_ns() %>%
  kable()
```


```{r}
# Gamma of the assigned topic, by topic, one facet per gender
# (disabled by the authors: mutate(title = reorder(title, gamma * topic)))
ggplot(classifi, aes(x = as.character(topic), y = gamma)) +
  geom_boxplot() +
  facet_wrap(~gender)
```

## Chi-square test 
```{r}
# Chi-square test of association between gender and assigned topic; the call
# is kept as is because chisq.test prints the argument expressions.
chisq.test(classifi$gender, classifi$topic)
```


# Topic model - Professors only

```{r}
# Professors only: document-term matrix for LDA, one document per
# "<id>_<gender>" key, one term per word, cell value = number of occurrences
matextP <- textP %>%
  count(id, gender, word) %>%
  transmute(id = paste(id, gender, sep = "_"), word, n) %>%
  cast_dtm(document = id, term = word, value = n)
```

```{r}
# Professors only: LDA fits with 2, 3 and 4 topics, all with the same seed
lda_controlP <- list(seed = 1234)
ap_lda2P <- LDA(matextP, k = 2, control = lda_controlP)
ap_lda3P <- LDA(matextP, k = 3, control = lda_controlP)
ap_lda4P <- LDA(matextP, k = 4, control = lda_controlP)
# AIC comparison table for the three fits
bbmle::AICtab(ap_lda2P, ap_lda3P, ap_lda4P, base = TRUE)
```

## word-topic probabilities
```{r}
# Professors only: per-topic word probabilities (beta), two-topic model
ap_topicsP <- tidy(ap_lda2P, matrix = "beta")
# Ten highest-beta terms of each topic (ties kept), sorted within topic
ap_top_termsP <- ap_topicsP %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
# Horizontal bars of beta per term, one facet per topic
ap_top_termsP %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~topic, scales = "free") +
  coord_flip()
```
Words with the greatest difference in beta between topics

```{r}
# Professors only: one row per term with its beta in topic 1 and topic 2 side
# by side; keep terms whose beta exceeds 0.001 in at least one topic and
# compute the log2 ratio of topic 2 over topic 1.
# NOTE(review): this reuses the name `beta_spread` and overwrites any earlier
# object of that name, unlike the other professors-only objects, which carry
# a `P` suffix.
beta_spread <- ap_topicsP %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))
beta_spread
# Plot the five smallest and five largest log ratios. The rows are picked
# relative to the table length rather than by the fixed positions 1:5 and
# 260:264, which only match the extremes when the table has exactly 264 rows;
# unique() avoids duplicates for tables under 10 rows.
beta_spread %>%
  arrange(log_ratio) %>%
  slice(unique(c(head(seq_len(n()), 5), tail(seq_len(n()), 5)))) %>%
  ggplot(aes(fct_reorder(term, log_ratio, min), log_ratio)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log2 ration of beta in topic 2/topic 1") +
  xlab("Word")
```

## Document-topic probabilities

```{r}
# Professors only: per-document topic probabilities (gamma), two-topic model
ap_documentsP <- tidy(ap_lda2P, matrix = "gamma")
# The gender code is the last character of the document key; each document is
# then assigned the topic with the highest gamma (top_n keeps ties)
classifiP <- ap_documentsP %>%
  mutate(gender = str_sub(document, -1)) %>%
  group_by(document, gender) %>%
  top_n(1, gamma)

# Raw cross-tabulation of gender by assigned topic
table(classifiP$gender, classifiP$topic)

library(janitor)
# Same table as row percentages with the counts appended
classifiP %>%
  tabyl(gender, topic) %>%
  adorn_percentages() %>%
  adorn_pct_formatting(digits = 0) %>%
  adorn_ns() %>%
  kable()
```


```{r}
# Professors only: gamma of the assigned topic, by topic, one facet per
# gender. The violin is drawn first and a narrow boxplot on top of it; with
# the layers the other way round the violin is painted over the boxplot.
classifiP %>%
  # mutate(title = reorder(title, gamma * topic)) %>%
  ggplot(aes(as.character(topic), gamma)) +
  geom_violin() +
  geom_boxplot(width = 0.2) +
  facet_wrap(~ gender)
```

Chi-square test 
```{r}
# Chi-square test of association between gender and assigned topic in the
# professors-only classification; the call is kept as is because chisq.test
# prints the argument expressions.
chisq.test(classifiP$gender, classifiP$topic)
```