Note: Students should always aim to produce
publication-worthy tables and figures. Unless otherwise stated,
tables should be rendered using stargazer::stargazer(), while
figures can be rendered using ggplot2::ggplot() or
plot(). Regardless, tables and figures should always be
presented with necessary formatting – e.g., (sub)title, axis (variable)
labels and titles, a clearly-identifiable legend and key, etc. Problem
sets must always be compiled using LaTeX or
RMarkdown and include the full coding routine (with notes
explaining your implementation) used to complete each problem
(10pts).
stargazer() for each
# Ten hand-written example sentences spanning positive, negative, and
# ambiguous sentiment, used to compare the three sentiment lexicons below.
approach) (2pts).sample_strings <- c("I absolutely love this product! It's amazing.",
"This is the worst experience I've ever had.",
"I feel indifferent about the movie; it was okay.",
"What a fantastic performance by the lead actor!",
"I'm very disappointed with the service today.",
"The book was quite interesting and informative.",
"I hate waiting in long lines; it frustrates me.",
"The weather today is beautiful and sunny.",
"I'm not sure how I feel about this situation.",
"The restaurant exceeded my expectations; great food!")
# Promote the character vector to a tibble keyed by doc_id so that
# tokenization and joins can track which sentence each word came from.
sample_strings <- tibble(
doc_id = seq_along(sample_strings),
text = sample_strings)
# Tokenize: one row per (doc_id, word); unnest_tokens lowercases by default.
strings_tokens <- sample_strings %>%
tidytext::unnest_tokens(word, text) # output column = word, input column = text
# Classify each document with the Bing lexicon: more positive than negative
# tokens -> "Positive", a tie -> "Neutral", otherwise "Negative".
# Documents with no Bing matches are absent (inner_join drops them).
bing <- strings_tokens %>%
  inner_join(tidytext::get_sentiments("bing"), by = "word") %>%
  count(doc_id, sentiment) %>%
  tidyr::pivot_wider(names_from = sentiment,
                     values_from = n,
                     values_fill = 0) %>%
  mutate(BING = case_when(
    positive == negative ~ 'Neutral',
    positive > negative ~ 'Positive',
    .default = 'Negative')) %>%
  select(doc_id, BING)
# Document-level AFINN score: sum the lexicon's integer valences over the
# matched tokens of each document. Result has exactly doc_id and AFINN.
afinn <- strings_tokens %>%
  inner_join(tidytext::get_sentiments("afinn"), by = "word") %>%
  group_by(doc_id) %>%
  summarise(AFINN = sum(value), .groups = "drop")
# sentimentr polarity: average sentence-level sentiment per document,
# rounded to three decimal places.
sentimentr <- sample_strings %>%
  mutate(
    sentimentR = sentimentr::sentiment_by(text)$ave_sentiment %>% round(3)
  ) %>%
  select(doc_id, sentimentR)
# Join the three sentiment measures onto the original text and print a
# stargazer table. Rows are keyed off `bing`, so any document without a
# Bing-lexicon match does not appear in the table.
combined <- bing %>%
  left_join(afinn, by = "doc_id") %>%
  left_join(sentimentr, by = "doc_id") %>%
  left_join(sample_strings, by = "doc_id") %>%
  select(Text = text, BING, AFINN, sentimentR)
stargazer::stargazer(combined, summary = FALSE, type = "text")
##
## ================================================================================
## Text BING AFINN sentimentR
## --------------------------------------------------------------------------------
## 1 I absolutely love this product! It's amazing. Positive 7 0.479
## 2 This is the worst experience I've ever had. Negative -3 -0.177
## 3 I feel indifferent about the movie; it was okay. Negative -2 -0.167
## 4 What a fantastic performance by the lead actor! Positive 4 0.495
## 5 I'm very disappointed with the service today. Negative -2 -0.68
## 6 The book was quite interesting and informative. Positive 2 0.51
## 7 I hate waiting in long lines; it frustrates me. Negative -5 -0.417
## 8 The weather today is beautiful and sunny. Positive 3 0.51
## 9 The restaurant exceeded my expectations; great food! Positive 3 0.491
## --------------------------------------------------------------------------------
Mosteller & Wallace (1963) considered, among others, the authorship of Federalist 55, 56, 57, 58, 62, and 63. Although the essays were written by three authors – James Madison, Alexander Hamilton, and John Jay – the last of these (Jay) is known not to have authored any essays beyond those directly attributed to him. Using the remaining essays whose authorship we can attribute to Madison or Hamilton, complete the following tasks:
A. Using only those essays where authorship is not disputed (i.e., authorship is known to be Hamilton, Madison, or Jay individually), reduce the complexity of the text and construct a corpus to recover the most commonly used words for each author. Further, produce both a data frame (or tibble) representing the 10 most common terms with their associated volumes, as well as a word cloud graphic of the 50 most common terms for each author (1pt).
B. With your corpus, construct the same items as (A) but aggregated across both authors – e.g., a data frame and word cloud of the most common terms for both Hamilton and Madison (1pt).
C. Using the multinomial language model approach, employ (B) to construct a vocabulary and associated multinomial probability vectors (\(\mu\)) for each author using at least five unique words (2pts)
D. For each of the listed and disputed Federalist Papers (55, 56, 57, 58, 62, and 63), recover \(\mathbf{W_{Hamilton}}\) and \(\mathbf{W_{Madison}}\) (2pts).
E. Using only Federalist 58, measure the cosine similarity between both \(\mathbf{W}_{Hamilton/Madison}\) and \(\mathbf{W}_{Federalist\hspace{0.5mm}58}\) (2pts).
# Display titles ("No. LV.", "No. LVI.", ...) for the six disputed essays,
# matching the title strings extracted from the Gutenberg text below.
roman_numbers <- paste0(as.character(as.roman(c(55, 56, 57, 58, 62, 63))), '.')
# paste0() already returns the finished strings; the original wrapped this in
# a redundant paste() call, which is an identity operation here.
federalist_essays_disputed <- paste0("No. ", roman_numbers)
# Fetch the Project Gutenberg HTML edition of The Federalist Papers and split
# it into per-essay nodes via the `.chapter` CSS class.
# NOTE(review): read_html/html_elements/html_text2 are rvest functions called
# unqualified -- assumes rvest is attached earlier in the document.
fed_papers <- read_html('https://www.gutenberg.org/cache/epub/18/pg18-images.html')
essays <- html_elements(fed_papers, '.chapter')
text <- html_text2(essays) # character vector: plain text of each essay
text <- tibble(text) # single-column tibble (column name: text)
# Attribute each essay to an author and extract its "No. ..." title.
federalist_essays <- text %>%
# Drop alternate copies that Gutenberg marks as "slightly different version".
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
# Jointly/disputedly attributed essays title-case to "Hamilton And Madison" /
# "Hamilton Or Madison" and are therefore excluded by this filter.
filter(author %in% c('Hamilton', 'Madison', 'Jay'))
# reduce_complexity() is defined elsewhere in the project; it appears to
# return one cleaned string per essay -- confirm against its definition.
federalist_essays$text <- sapply(federalist_essays$text, reduce_complexity)
# Build a corpus of all single-author essays (author kept as a docvar),
# convert it to a document-feature matrix, and aggregate the dfm to one
# row per author.
fed_corpus <- quanteda::corpus(
  federalist_essays$text,
  docnames = federalist_essays$title,
  docvars = federalist_essays %>% select(author))
fed_dfm <- quanteda::tokens(fed_corpus) %>%
  quanteda::dfm()
fed_dfm_by_author <- quanteda::dfm_group(
  fed_dfm,
  groups = quanteda::docvars(fed_corpus, "author"))
# For each author-level document, pull the 10 highest-frequency terms and
# collect them into one long table (author, word, frequency).
# Fix: preallocate a list and bind once, instead of growing a data.frame
# with bind_rows() inside the loop; use seq_len() rather than 1:n.
author_top_list <- vector("list", ndoc(fed_dfm_by_author))
for (i in seq_len(ndoc(fed_dfm_by_author))) {
  author_name <- docnames(fed_dfm_by_author)[i]
  words <- topfeatures(fed_dfm_by_author[i, ], n = 10)
  author_top_list[[i]] <- tibble(
    author = author_name,
    word = names(words),
    frequency = as.numeric(words))
}
top_words_by_author <- bind_rows(author_top_list)
# Hamilton's 10 most frequent terms, relabeled for the printed table.
hamilton <- top_words_by_author %>%
  filter(author == 'Hamilton') %>%
  select(Word = word, `Freq.` = frequency)
stargazer::stargazer(hamilton, summary = FALSE, type = 'text')
##
## =====================
## Word Freq.
## ---------------------
## 1 state 954
## 2 will 711
## 3 may 698
## 4 power 521
## 5 government 498
## 6 upon 374
## 7 can 292
## 8 one 290
## 9 constitution 284
## 10 people 271
## ---------------------
# Madison's 10 most frequent terms, relabeled for the printed table.
madison <- top_words_by_author %>%
  filter(author == 'Madison') %>%
  select(-author) %>%
  rename(Word = word, `Freq.` = frequency)
stargazer::stargazer(madison, summary = FALSE, type = 'text')
##
## =====================
## Word Freq.
## ---------------------
## 1 state 443
## 2 government 306
## 3 will 287
## 4 power 274
## 5 may 239
## 6 constitution 182
## 7 people 146
## 8 one 128
## 9 great 118
## 10 federal 111
## ---------------------
# Jay's 10 most frequent terms, relabeled for the printed table.
jay <- top_words_by_author %>%
  filter(author == 'Jay') %>%
  select(Word = word, `Freq.` = frequency)
stargazer::stargazer(jay, summary = FALSE, type = 'text')
##
## ===================
## Word Freq.
## -------------------
## 1 will 76
## 2 government 50
## 3 good 50
## 4 people 48
## 5 one 47
## 6 may 42
## 7 nation 40
## 8 state 39
## 9 make 33
## 10 treaty 33
## -------------------
# Comparison cloud of the 50 most frequent terms, one color per author group.
quanteda.textplots::textplot_wordcloud(fed_dfm_by_author, comparison = TRUE, max_words = 50,
color = c("blue", "red", 'orange'))
# Restrict to Madison and Hamilton, rebuild the corpus/dfm over just their
# essays, and tabulate the 10 most frequent terms across both authors.
federalist_essays_madison_hamilton <- federalist_essays %>%
  filter(author %in% c('Madison', 'Hamilton'))
fed_corpus <- quanteda::corpus(
  federalist_essays_madison_hamilton$text,
  docnames = federalist_essays_madison_hamilton$title,
  docvars = federalist_essays_madison_hamilton %>% select(author))
fed_dfm <- quanteda::tokens(fed_corpus) %>%
  quanteda::dfm() %>%
  quanteda::dfm_trim(min_termfreq = 2) # drop terms appearing only once
words <- topfeatures(fed_dfm, n = 10)
combined <- tibble(
  author = 'Madison and Hamilton',
  word = names(words),
  frequency = as.numeric(words))
stargazer::stargazer(combined, summary = FALSE, type = 'text')
##
## ==============================================
## author word frequency
## ----------------------------------------------
## 1 Madison and Hamilton state 1397
## 2 Madison and Hamilton will 998
## 3 Madison and Hamilton may 937
## 4 Madison and Hamilton government 804
## 5 Madison and Hamilton power 795
## 6 Madison and Hamilton constitution 466
## 7 Madison and Hamilton one 418
## 8 Madison and Hamilton people 417
## 9 Madison and Hamilton upon 381
## 10 Madison and Hamilton great 375
## ----------------------------------------------
# Word cloud of the 50 most frequent terms across Madison + Hamilton combined.
quanteda.textplots::textplot_wordcloud(fed_dfm, max_words = 50)
# Per-author counts of the 5 shared vocabulary words, pivoted wide, with a
# TOTAL row (each author's token total over the vocabulary) appended.
federalist_essays %>%
  filter(author != 'Jay') %>%
  tidytext::unnest_tokens(input = 'text', output = 'word') %>%
  filter(word %in% combined$word[1:5]) %>%
  count(author, word) %>%
  tidyr::pivot_wider(names_from = author, values_from = n, values_fill = 0) %>%
  {
    wide <- .
    totals <- wide %>%
      select(-word) %>%
      summarise(across(everything(), sum)) %>%
      mutate(word = "TOTAL") %>%
      select(word, everything())
    bind_rows(wide, totals)
  }
## # A tibble: 6 × 3
## word Hamilton Madison
## <chr> <int> <int>
## 1 government 498 306
## 2 may 698 239
## 3 power 521 274
## 4 state 954 443
## 5 will 711 287
## 6 TOTAL 3382 1549
\[\mathbf{W_{Hamilton}} = \text{Multinomial}(3382, \mu_{H})\]
\[\mathbf{W_{Madison}} = \text{Multinomial}(1549, \mu_{M})\]
\[\mu_{Hamilton} = \left( \frac{498}{3382}, \frac{698}{3382}, \frac{521}{3382}, \frac{954}{3382}, \frac{711}{3382} \right)\]
\[\mu_{Hamilton} = (0.147,\,0.206,\,0.154,\,0.282,\,0.210)\]
\[\mu_{Madison} = \left( \frac{306}{1549}, \frac{239}{1549}, \frac{274}{1549}, \frac{443}{1549}, \frac{287}{1549} \right)\]
\[\mu_{Madison} = (0.198,\,0.154,\,0.177,\,0.286,\,0.185)\]
# For each disputed essay, build a 5-element count vector over the shared
# vocabulary (combined$word[1:5]).
temp_essay_vectors <- list()
for (i in 1:length(federalist_essays_disputed)){
temp_disputed_essay <- federalist_essays_disputed[i]
# Re-parse the raw Gutenberg text and keep only the current disputed essay.
temp_essay <- text %>%
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
filter(title == temp_disputed_essay)
# reduce_complexity() is defined elsewhere in the project; presumably it
# cleans/normalizes the essay text -- confirm against its definition.
temp_essay$text <- sapply(temp_essay$text, reduce_complexity)
# Per-word frequency table for this essay (one row per distinct token).
temp_essay <- temp_essay %>%
tidytext::unnest_tokens(input = 'text', output = 'word') %>%
mutate(word = factor(word)) %>%
count(word, .drop = FALSE) %>%
setNames(c('word', 'count'))
# Start from a zero vector named by the 5 vocabulary words ...
temp_essay_vector <- rep(0, length = 5)
names(temp_essay_vector) <- combined$word[1:5]
# ... then write in every essay word's count. Assigning by a name not yet
# present APPENDS a new element, so non-vocabulary words pile up here and
# are removed again just below.
for (row in 1:nrow(temp_essay)){
temp_row <- temp_essay[row,]
temp_essay_vector[[as.character(temp_row$word)]] <- temp_row$count
}
# Keep only the 5 vocabulary entries (their original order is preserved).
temp_essay_vector <- temp_essay_vector[names(temp_essay_vector) %in% unique(combined$word)[1:5]]
temp_essay_vectors[[as.character(temp_disputed_essay)]] <- temp_essay_vector
} # Get count vector for each disputed Federalist Paper over the vocabulary
# Vocabulary-count vectors for each known author, over the same 5 words.
author_vectors <- list()
for (i in c('Hamilton', 'Madison')){
# Re-derive author/title from the raw Gutenberg text and keep essays whose
# single attributed author matches the current one.
temp_author_vector <- text %>%
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
filter(author == i ) %>%
# NOTE(review): unlike the corpus built earlier, this text is NOT passed
# through reduce_complexity() first -- confirm the counts are comparable.
tidytext::unnest_tokens(input = 'text', output = 'word') %>%
filter(word %in% c(combined$word[1:5])) %>%
count(author, word)
# Named count vector: names are vocabulary words, values are frequencies.
temp_vector <- c(temp_author_vector$n)
names(temp_vector) <- c(temp_author_vector$word)
# NOTE(review): relies on `temp_essay_vector` left over from the previous
# loop; its names equal combined$word[1:5] and fix the element order here.
ref_names <- names(temp_essay_vector)
temp_vector <- temp_vector[ref_names]
author_vectors[[as.character(i)]] <- temp_vector
}
# Score each disputed essay under both authors' multinomial models and record
# which author assigns it the higher likelihood.
# Fix: preallocate a list and bind once instead of growing a data.frame with
# bind_rows() inside the loop; use seq_along() rather than 1:length().
likelihood_rows <- vector("list", length(temp_essay_vectors))
for (i in seq_along(temp_essay_vectors)){
  temp_essay <- names(temp_essay_vectors)[i]
  # dmultinom() normalizes `prob` internally; adding 1 to each count is
  # Laplace smoothing, so no vocabulary word has zero probability.
  temp_hamilton <- dmultinom(x = temp_essay_vectors[[i]],
                             prob = author_vectors[['Hamilton']] + 1)
  temp_madison <- dmultinom(x = temp_essay_vectors[[i]],
                            prob = author_vectors[['Madison']] + 1)
  likelihood_rows[[i]] <- data.frame(
    Essay = temp_essay,
    Hamilton = log(temp_hamilton),
    Madison = log(temp_madison),
    # Ties (exactly equal likelihoods) resolve to Hamilton.
    Winner = ifelse(temp_madison > temp_hamilton, 'Madison', 'Hamilton'))
}
temp_likelihoods <- bind_rows(likelihood_rows)
stargazer::stargazer(temp_likelihoods, summary = F, type = 'text')
##
## ======================================
## Essay Hamilton Madison Winner
## --------------------------------------
## 1 No. LV. -12.218 -14.686 Hamilton
## 2 No. LVI. -31.687 -36.260 Hamilton
## 3 No. LVII. -18.389 -16.485 Madison
## 4 No. LVIII. -19.777 -21.334 Hamilton
## 5 No. LXII. -19.311 -15.295 Madison
## 6 No. LXIII. -12.222 -11.436 Madison
## --------------------------------------
# Rebuild the 5-word count vector for Federalist No. 58 only (same procedure
# as the disputed-essay loop above) for the cosine-similarity comparison.
temp_essay <- text %>%
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
filter(title == 'No. LVIII.')
# Clean the essay text (reduce_complexity() is defined elsewhere in the project).
temp_essay$text <- sapply(temp_essay$text, reduce_complexity)
# Per-word frequency table for the essay.
temp_essay <- temp_essay %>%
tidytext::unnest_tokens(input = 'text', output = 'word') %>%
mutate(word = factor(word)) %>%
count(word, .drop = FALSE) %>%
setNames(c('word', 'count'))
# Zero vector named by the 5 vocabulary words ...
temp_essay_vector <- rep(0, length = 5)
names(temp_essay_vector) <- combined$word[1:5]
# ... fill in counts; non-vocabulary words get appended and are removed below.
for (row in 1:nrow(temp_essay)){
temp_row <- temp_essay[row,]
temp_essay_vector[[as.character(temp_row$word)]] <- temp_row$count
}
temp_essay_vector <- temp_essay_vector[names(temp_essay_vector) %in% unique(combined$word)[1:5]]
# Cosine similarity between Federalist No. 58's vocabulary-count vector and
# each candidate author's vocabulary-count vector.
temp_hamilton_vector <- author_vectors[['Hamilton']]
temp_madison_vector <- author_vectors[['Madison']]
# Euclidean norms of the three vectors.
essay_mag <- sqrt(sum(temp_essay_vector^2))
hamilton_mag <- sqrt(sum(temp_hamilton_vector^2))
madison_mag <- sqrt(sum(temp_madison_vector^2))
# Dot products of the essay vector with each author vector.
hamilton_inner <- sum(temp_essay_vector * temp_hamilton_vector)
madison_inner <- sum(temp_essay_vector * temp_madison_vector)
# cos(theta) = <a, b> / (||a|| * ||b||), reported per author.
data.frame(
  Author = c('Hamilton', 'Madison'),
  Cosine = c(hamilton_inner / (hamilton_mag * essay_mag),
             madison_inner / (madison_mag * essay_mag)))
## Author Cosine
## 1 Hamilton 0.9251719
## 2 Madison 0.9060235