Note: Students should always aim to produce
publication-worthy tables and figures. Unless otherwise stated,
tables should be rendered using stargazer::stargazer(), while
figures can be rendered using ggplot2::ggplot() or
plot(). Regardless, tables and figures should always be
presented with necessary formatting – e.g., (sub)title, axis (variable)
labels and titles, a clearly-identifiable legend and key, etc. Problem
sets must always be compiled using LaTeX or
RMarkdown and include the full coding routine (with notes
explaining your implementation) used to complete each problem
(10pts).
stargazer() for each
# Ten hand-written example sentences spanning positive, negative, and
# ambiguous sentiment, used to compare the three sentiment lexicons below.
approach) (2pts).sample_strings <- c("I absolutely love this product! It's amazing.",
"This is the worst experience I've ever had.",
"I feel indifferent about the movie; it was okay.",
"What a fantastic performance by the lead actor!",
"I'm very disappointed with the service today.",
"The book was quite interesting and informative.",
"I hate waiting in long lines; it frustrates me.",
"The weather today is beautiful and sunny.",
"I'm not sure how I feel about this situation.",
"The restaurant exceeded my expectations; great food!")
# Promote the character vector to a tibble keyed by doc_id so that
# tokenization and joins can track which sentence each word came from.
sample_strings <- tibble(
doc_id = seq_along(sample_strings),
text = sample_strings)
# Tokenize: one row per (doc_id, word); unnest_tokens lowercases by default.
strings_tokens <- sample_strings %>%
tidytext::unnest_tokens(word, text) # output column = word, input column = text
# Classify each document with the Bing lexicon: more positive than negative
# tokens -> "Positive", a tie -> "Neutral", otherwise "Negative".
# Documents with no Bing matches are absent (inner_join drops them).
bing <- strings_tokens %>%
  inner_join(tidytext::get_sentiments("bing"), by = "word") %>%
  count(doc_id, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment,
                     values_from = n,
                     values_fill = 0) %>%
  mutate(BING = case_when(
    positive == negative ~ 'Neutral',
    positive > negative ~ 'Positive',
    .default = 'Negative')) %>%
  select(doc_id, BING)
# Document-level AFINN score: sum the lexicon's integer valences over the
# matched tokens of each document. Result has exactly doc_id and AFINN.
afinn <- strings_tokens %>%
  inner_join(tidytext::get_sentiments("afinn"), by = "word") %>%
  group_by(doc_id) %>%
  summarise(AFINN = sum(value), .groups = "drop")
# sentimentr polarity: average sentence-level sentiment per document,
# rounded to three decimal places.
sentimentr <- sample_strings %>%
  mutate(
    sentimentR = sentimentr::sentiment_by(text)$ave_sentiment %>% round(3)
  ) %>%
  select(doc_id, sentimentR)
# Join the three sentiment measures onto the original text and print a
# stargazer table. Rows are keyed off `bing`, so any document without a
# Bing-lexicon match does not appear in the table.
combined <- bing %>%
  left_join(afinn, by = "doc_id") %>%
  left_join(sentimentr, by = "doc_id") %>%
  left_join(sample_strings, by = "doc_id") %>%
  select(Text = text, BING, AFINN, sentimentR)
stargazer::stargazer(combined, summary = FALSE, type = "text")
##
## ================================================================================
## Text BING AFINN sentimentR
## --------------------------------------------------------------------------------
## 1 I absolutely love this product! It's amazing. Positive 7 0.479
## 2 This is the worst experience I've ever had. Negative -3 -0.177
## 3 I feel indifferent about the movie; it was okay. Negative -2 -0.167
## 4 What a fantastic performance by the lead actor! Positive 4 0.495
## 5 I'm very disappointed with the service today. Negative -2 -0.68
## 6 The book was quite interesting and informative. Positive 2 0.51
## 7 I hate waiting in long lines; it frustrates me. Negative -5 -0.417
## 8 The weather today is beautiful and sunny. Positive 3 0.51
## 9 The restaurant exceeded my expectations; great food! Positive 3 0.491
## --------------------------------------------------------------------------------
Mosteller & Wallace (1963) considered, among others, the authorship of Federalist 55, 56, 57, 58, 62, and 63. Although the essays were written by three authors – James Madison, Alexander Hamilton, and John Jay – the last of these (Jay) is known not to have authored any essays beyond those directly attributed to him. Using the remaining essays whose authorship we can attribute to Madison or Hamilton, complete the following tasks:
A. Using only those essays where authorship is not disputed (i.e., authorship is known to be Hamilton, Madison, or Jay individually), reduce the complexity of the text and construct a corpus to recover the most commonly used words for each author. Further, produce both a data frame (or tibble) representing the 10 most common terms with their associated volumes, as well as a word cloud graphic of the 50 most common terms for each author (1pt).
B. With your corpus, construct the same items as (A) but aggregated across both authors – e.g., a data frame and word cloud of the most common terms for both Hamilton and Madison (1pt).
C. Using the multinomial language model approach, employ (B) to construct a vocabulary and associated multinomial probability vectors (\(\mu\)) for each author using at least five unique words (2pts)
D. For each of the listed and disputed Federalist Papers (55, 56, 57, 58, 62, and 63), recover \(\mathbf{W_{Hamilton}}\) and \(\mathbf{W_{Madison}}\) (2pts).
E. Using only Federalist 58, measure the cosine similarity between both \(\mathbf{W}_{Hamilton/Madison}\) and \(\mathbf{W}_{Federalist\hspace{0.5mm}58}\) (2pts).
# Display titles ("No. LV.", "No. LVI.", ...) for the six disputed essays,
# matching the title strings extracted from the Gutenberg text below.
roman_numbers <- paste0(as.character(as.roman(c(55, 56, 57, 58, 62, 63))), '.')
# paste0() already returns the finished strings; the original wrapped this in
# a redundant paste() call, which is an identity operation here.
federalist_essays_disputed <- paste0("No. ", roman_numbers)
# Fetch the Project Gutenberg HTML edition of The Federalist Papers and split
# it into per-essay nodes via the `.chapter` CSS class.
# NOTE(review): read_html/html_elements/html_text2 are rvest functions called
# unqualified -- assumes rvest is attached earlier in the document.
fed_papers <- read_html('https://www.gutenberg.org/cache/epub/18/pg18-images.html')
essays <- html_elements(fed_papers, '.chapter')
text <- html_text2(essays) # character vector: plain text of each essay
text <- tibble(text) # single-column tibble (column name: text)
# Attribute each essay to an author and extract its "No. ..." title.
federalist_essays <- text %>%
# Drop alternate copies that Gutenberg marks as "slightly different version".
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
# Jointly/disputedly attributed essays title-case to "Hamilton And Madison" /
# "Hamilton Or Madison" and are therefore excluded by this filter.
filter(author %in% c('Hamilton', 'Madison', 'Jay'))
# reduce_complexity() is defined elsewhere in the project; it appears to
# return one cleaned string per essay -- confirm against its definition.
federalist_essays$text <- sapply(federalist_essays$text, reduce_complexity)
# Build a corpus of all single-author essays (author kept as a docvar),
# convert it to a document-feature matrix, and aggregate the dfm to one
# row per author.
fed_corpus <- quanteda::corpus(
  federalist_essays$text,
  docnames = federalist_essays$title,
  docvars = federalist_essays %>% select(author))
fed_dfm <- quanteda::tokens(fed_corpus) %>%
  quanteda::dfm()
fed_dfm_by_author <- quanteda::dfm_group(
  fed_dfm,
  groups = quanteda::docvars(fed_corpus, "author"))
# For each author-level document, pull the 10 highest-frequency terms and
# collect them into one long table (author, word, frequency).
# Fix: preallocate a list and bind once, instead of growing a data.frame
# with bind_rows() inside the loop; use seq_len() rather than 1:n.
author_top_list <- vector("list", ndoc(fed_dfm_by_author))
for (i in seq_len(ndoc(fed_dfm_by_author))) {
  author_name <- docnames(fed_dfm_by_author)[i]
  words <- topfeatures(fed_dfm_by_author[i, ], n = 10)
  author_top_list[[i]] <- tibble(
    author = author_name,
    word = names(words),
    frequency = as.numeric(words))
}
top_words_by_author <- bind_rows(author_top_list)
# Hamilton's 10 most frequent terms, relabeled for the printed table.
hamilton <- top_words_by_author %>%
  filter(author == 'Hamilton') %>%
  select(Word = word, `Freq.` = frequency)
stargazer::stargazer(hamilton, summary = FALSE, type = 'text')
##
## =====================
## Word Freq.
## ---------------------
## 1 state 954
## 2 will 711
## 3 may 698
## 4 power 521
## 5 government 498
## 6 upon 374
## 7 can 292
## 8 one 290
## 9 constitution 284
## 10 people 271
## ---------------------
# Madison's 10 most frequent terms, relabeled for the printed table.
madison <- top_words_by_author %>%
  filter(author == 'Madison') %>%
  select(-author) %>%
  rename(Word = word, `Freq.` = frequency)
stargazer::stargazer(madison, summary = FALSE, type = 'text')
##
## =====================
## Word Freq.
## ---------------------
## 1 state 443
## 2 government 306
## 3 will 287
## 4 power 274
## 5 may 239
## 6 constitution 182
## 7 people 146
## 8 one 128
## 9 great 118
## 10 federal 111
## ---------------------
# Jay's 10 most frequent terms, relabeled for the printed table.
jay <- top_words_by_author %>%
  filter(author == 'Jay') %>%
  select(Word = word, `Freq.` = frequency)
stargazer::stargazer(jay, summary = FALSE, type = 'text')
##
## ===================
## Word Freq.
## -------------------
## 1 will 76
## 2 government 50
## 3 good 50
## 4 people 48
## 5 one 47
## 6 may 42
## 7 nation 40
## 8 state 39
## 9 make 33
## 10 treaty 33
## -------------------
# Comparison cloud of the 50 most frequent terms, one color per author group.
quanteda.textplots::textplot_wordcloud(fed_dfm_by_author, comparison = TRUE, max_words = 50,
color = c("blue", "red", 'orange'))
# Restrict to Madison and Hamilton, rebuild the corpus/dfm over just their
# essays, and tabulate the 10 most frequent terms across both authors.
federalist_essays_madison_hamilton <- federalist_essays %>%
  filter(author %in% c('Madison', 'Hamilton'))
fed_corpus <- quanteda::corpus(
  federalist_essays_madison_hamilton$text,
  docnames = federalist_essays_madison_hamilton$title,
  docvars = federalist_essays_madison_hamilton %>% select(author))
fed_dfm <- quanteda::tokens(fed_corpus) %>%
  quanteda::dfm() %>%
  quanteda::dfm_trim(min_termfreq = 2) # drop terms appearing only once
words <- topfeatures(fed_dfm, n = 10)
combined <- tibble(
  author = 'Madison and Hamilton',
  word = names(words),
  frequency = as.numeric(words))
stargazer::stargazer(combined, summary = FALSE, type = 'text')
##
## ==============================================
## author word frequency
## ----------------------------------------------
## 1 Madison and Hamilton state 1397
## 2 Madison and Hamilton will 998
## 3 Madison and Hamilton may 937
## 4 Madison and Hamilton government 804
## 5 Madison and Hamilton power 795
## 6 Madison and Hamilton constitution 466
## 7 Madison and Hamilton one 418
## 8 Madison and Hamilton people 417
## 9 Madison and Hamilton upon 381
## 10 Madison and Hamilton great 375
## ----------------------------------------------
# Word cloud of the 50 most frequent terms across Madison + Hamilton combined.
quanteda.textplots::textplot_wordcloud(fed_dfm, max_words = 50)
# Per-author counts of the 5 shared vocabulary words, pivoted wide, with a
# TOTAL row (each author's token total over the vocabulary) appended.
federalist_essays %>%
  filter(author != 'Jay') %>%
  tidytext::unnest_tokens(input = 'text', output = 'word') %>%
  filter(word %in% combined$word[1:5]) %>%
  count(author, word) %>%
  tidyr::pivot_wider(names_from = author, values_from = n, values_fill = 0) %>%
  {
    wide <- .
    totals <- wide %>%
      select(-word) %>%
      summarise(across(everything(), sum)) %>%
      mutate(word = "TOTAL") %>%
      select(word, everything())
    bind_rows(wide, totals)
  }
## # A tibble: 6 × 3
## word Hamilton Madison
## <chr> <int> <int>
## 1 government 498 306
## 2 may 698 239
## 3 power 521 274
## 4 state 954 443
## 5 will 711 287
## 6 TOTAL 3382 1549
\[\mathbf{W_{Hamilton}} = \text{Multinomial}(3382, \mu_{H})\]
\[\mathbf{W_{Madison}} = \text{Multinomial}(1549, \mu_{M})\]
\[\mu_{Hamilton} = \left( \frac{498}{3382}, \frac{698}{3382}, \frac{521}{3382}, \frac{954}{3382}, \frac{711}{3382} \right)\]
\[\mu_{Hamilton} = (0.147,\,0.206,\,0.154,\,0.282,\,0.210)\]
\[\mu_{Madison} = \left( \frac{306}{1549}, \frac{239}{1549}, \frac{274}{1549}, \frac{443}{1549}, \frac{287}{1549} \right)\]
\[\mu_{Madison} = (0.198,\,0.154,\,0.177,\,0.286,\,0.185)\]
# For each disputed essay, build a 5-element count vector over the shared
# vocabulary (combined$word[1:5]).
temp_essay_vectors <- list()
for (i in 1:length(federalist_essays_disputed)){
temp_disputed_essay <- federalist_essays_disputed[i]
# Re-parse the raw Gutenberg text and keep only the current disputed essay.
temp_essay <- text %>%
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
filter(title == temp_disputed_essay)
# reduce_complexity() is defined elsewhere in the project; presumably it
# cleans/normalizes the essay text -- confirm against its definition.
temp_essay$text <- sapply(temp_essay$text, reduce_complexity)
# Per-word frequency table for this essay (one row per distinct token).
temp_essay <- temp_essay %>%
tidytext::unnest_tokens(input = 'text', output = 'word') %>%
mutate(word = factor(word)) %>%
count(word, .drop = FALSE) %>%
setNames(c('word', 'count'))
# Start from a zero vector named by the 5 vocabulary words ...
temp_essay_vector <- rep(0, length = 5)
names(temp_essay_vector) <- combined$word[1:5]
# ... then write in every essay word's count. Assigning by a name not yet
# present APPENDS a new element, so non-vocabulary words pile up here and
# are removed again just below.
for (row in 1:nrow(temp_essay)){
temp_row <- temp_essay[row,]
temp_essay_vector[[as.character(temp_row$word)]] <- temp_row$count
}
# Keep only the 5 vocabulary entries (their original order is preserved).
temp_essay_vector <- temp_essay_vector[names(temp_essay_vector) %in% unique(combined$word)[1:5]]
temp_essay_vectors[[as.character(temp_disputed_essay)]] <- temp_essay_vector
} # Get count vector for each disputed Federalist Paper over the vocabulary
# Vocabulary-count vectors for each known author, over the same 5 words.
author_vectors <- list()
for (i in c('Hamilton', 'Madison')){
# Re-derive author/title from the raw Gutenberg text and keep essays whose
# single attributed author matches the current one.
temp_author_vector <- text %>%
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
filter(author == i ) %>%
# NOTE(review): unlike the corpus built earlier, this text is NOT passed
# through reduce_complexity() first -- confirm the counts are comparable.
tidytext::unnest_tokens(input = 'text', output = 'word') %>%
filter(word %in% c(combined$word[1:5])) %>%
count(author, word)
# Named count vector: names are vocabulary words, values are frequencies.
temp_vector <- c(temp_author_vector$n)
names(temp_vector) <- c(temp_author_vector$word)
# NOTE(review): relies on `temp_essay_vector` left over from the previous
# loop; its names equal combined$word[1:5] and fix the element order here.
ref_names <- names(temp_essay_vector)
temp_vector <- temp_vector[ref_names]
author_vectors[[as.character(i)]] <- temp_vector
}
# Score each disputed essay under both authors' multinomial models and record
# which author assigns it the higher likelihood.
# Fix: preallocate a list and bind once instead of growing a data.frame with
# bind_rows() inside the loop; use seq_along() rather than 1:length().
likelihood_rows <- vector("list", length(temp_essay_vectors))
for (i in seq_along(temp_essay_vectors)){
  temp_essay <- names(temp_essay_vectors)[i]
  # dmultinom() normalizes `prob` internally; adding 1 to each count is
  # Laplace smoothing, so no vocabulary word has zero probability.
  temp_hamilton <- dmultinom(x = temp_essay_vectors[[i]],
                             prob = author_vectors[['Hamilton']] + 1)
  temp_madison <- dmultinom(x = temp_essay_vectors[[i]],
                            prob = author_vectors[['Madison']] + 1)
  likelihood_rows[[i]] <- data.frame(
    Essay = temp_essay,
    Hamilton = log(temp_hamilton),
    Madison = log(temp_madison),
    # Ties (exactly equal likelihoods) resolve to Hamilton.
    Winner = ifelse(temp_madison > temp_hamilton, 'Madison', 'Hamilton'))
}
temp_likelihoods <- bind_rows(likelihood_rows)
stargazer::stargazer(temp_likelihoods, summary = F, type = 'text')
##
## ======================================
## Essay Hamilton Madison Winner
## --------------------------------------
## 1 No. LV. -12.218 -14.686 Hamilton
## 2 No. LVI. -31.687 -36.260 Hamilton
## 3 No. LVII. -18.389 -16.485 Madison
## 4 No. LVIII. -19.777 -21.334 Hamilton
## 5 No. LXII. -19.311 -15.295 Madison
## 6 No. LXIII. -12.222 -11.436 Madison
## --------------------------------------
# Rebuild the 5-word count vector for Federalist No. 58 only (same procedure
# as the disputed-essay loop above) for the cosine-similarity comparison.
temp_essay <- text %>%
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
filter(title == 'No. LVIII.')
# Clean the essay text (reduce_complexity() is defined elsewhere in the project).
temp_essay$text <- sapply(temp_essay$text, reduce_complexity)
# Per-word frequency table for the essay.
temp_essay <- temp_essay %>%
tidytext::unnest_tokens(input = 'text', output = 'word') %>%
mutate(word = factor(word)) %>%
count(word, .drop = FALSE) %>%
setNames(c('word', 'count'))
# Zero vector named by the 5 vocabulary words ...
temp_essay_vector <- rep(0, length = 5)
names(temp_essay_vector) <- combined$word[1:5]
# ... fill in counts; non-vocabulary words get appended and are removed below.
for (row in 1:nrow(temp_essay)){
temp_row <- temp_essay[row,]
temp_essay_vector[[as.character(temp_row$word)]] <- temp_row$count
}
temp_essay_vector <- temp_essay_vector[names(temp_essay_vector) %in% unique(combined$word)[1:5]]
# Cosine similarity between Federalist No. 58's vocabulary-count vector and
# each candidate author's vocabulary-count vector.
temp_hamilton_vector <- author_vectors[['Hamilton']]
temp_madison_vector <- author_vectors[['Madison']]
# Euclidean norms of the three vectors.
essay_mag <- sqrt(sum(temp_essay_vector^2))
hamilton_mag <- sqrt(sum(temp_hamilton_vector^2))
madison_mag <- sqrt(sum(temp_madison_vector^2))
# Dot products of the essay vector with each author vector.
hamilton_inner <- sum(temp_essay_vector * temp_hamilton_vector)
madison_inner <- sum(temp_essay_vector * temp_madison_vector)
# cos(theta) = <a, b> / (||a|| * ||b||), reported per author.
data.frame(
  Author = c('Hamilton', 'Madison'),
  Cosine = c(hamilton_inner / (hamilton_mag * essay_mag),
             madison_inner / (madison_mag * essay_mag)))
## Author Cosine
## 1 Hamilton 0.9251719
## 2 Madison 0.9060235