Note: Students should always aim to produce
publication-worthy tables and figures. Unless otherwise stated,
tables should be rendered using stargazer::stargazer(), while
figures can be rendered using ggplot2::ggplot() or
plot(). Regardless, tables and figures should always be
presented with necessary formatting – e.g., (sub)title, axis (variable)
labels and titles, a clearly-identifiable legend and key, etc. Problem
sets must always be compiled using LaTeX or
RMarkdown and include the full coding routine (with notes
explaining your implementation) used to complete each problem
(10pts).
| Document_ID | Category | Terms |
|---|---|---|
| 1 | Democrat | healthcare, taxes, security |
| 2 | Democrat | climate, energy, security |
| 3 | Democrat | healthcare, equity, border |
| 4 | Democrat | immigration, pathway, energy |
| 5 | Democrat | taxes, diplomacy, defense |
| 6 | Republican | taxes, energy, security |
| 7 | Republican | border, enforcement, energy |
| 8 | Republican | healthcare, market, defense |
| 9 | Republican | taxes, domestic, security |
| 10 | Republican | immigration, enforcement, defense |
A. For two words found in both classes (Democratic and
Republican), calculate the mutual information. There are
seven total, but you only need to choose two.
# Mutual information between the class label (Dem/Rep) and word presence.
# Unconditional class entropy H(K): five Dem and five Rep documents.
pi_dem <- 5/10 # Unconditional (Dem)
pi_rep <- 5/10 # Unconditional (Rep)
hk <- -pi_dem * log2(pi_dem) - pi_rep * log2(pi_rep) # Unconditional entropy
# Healthcare
p_present <- 3/10 # Healthcare appears in 3/10 docs
health_dem <- 2/3 # Dem share among docs with "healthcare" (Docs 1 and 3)
health_rep <- 1/3 # Rep share among docs with "healthcare" (Doc 8)
hk_healthcare <- -(health_dem*log2(health_dem)+health_rep*log2(health_rep))
p_absent <- 7/10 # Healthcare missing in 7/10 docs
dem_absent <- 3/7 # Dem share among docs without "healthcare"
rep_absent <- 4/7 # Rep share among docs without "healthcare"
hk_healthcare_absent <- -(dem_absent*log2(dem_absent)+rep_absent*log2(rep_absent))
# Conditional entropy H(K | healthcare), weighted by presence/absence.
# (Fixed: the original chained `=` and `<-`, creating a stray H_y_given_x.)
hk_conditional <- p_present * hk_healthcare +
p_absent * hk_healthcare_absent
mi_healthcare <- hk - hk_conditional
mi_healthcare # Mutual Information (Healthcare)
## [1] 0.03485155
# Taxes
p_present <- 4/10 # Taxes appears in 4/10 docs
p_absent <- 6/10 # Taxes missing in 6/10 docs
taxes_dem <- 2/4 # Dem share among docs with "taxes" (Docs 1 and 5)
taxes_rep <- 2/4 # Rep share among docs with "taxes" (Docs 6 and 9)
hk_taxes <- -(taxes_dem*log2(taxes_dem)+taxes_rep*log2(taxes_rep))
dem_absent <- 3/6 # Dem share among docs without "taxes" (Docs 2, 3, 4)
rep_absent <- 3/6 # Rep share among docs without "taxes" (Docs 7, 8, 10)
# Fixed: the original carried the 3/7 and 4/7 fractions over from the
# healthcare block and then computed the absent entropy from taxes_dem /
# taxes_rep; both shares are 3/6 here. The MI is unchanged (still 0)
# because both distributions happen to be uniform, but the fix matters
# for any word whose absent split is not 50/50.
hk_taxes_absent <- -(dem_absent * log2(dem_absent) + rep_absent * log2(rep_absent))
hk_conditional <- p_present * hk_taxes + p_absent * hk_taxes_absent
mi_taxes <- hk - hk_conditional
mi_taxes # Mutual Information (Taxes)
## [1] 0
B. For the words Healthcare, Energy, and
Security, use the Fightin’ Words approach to recover the
log odds ratio measuring how strongly each word predicts one class over the other.
# Fightin' Words: posterior mean word rates under a symmetric Dirichlet
# prior, then the log odds ratio between the Dem and Rep rates.
# Posterior mean = (n[word, class] + alpha) / (n[tokens in class] + V * alpha),
# with alpha = 0.5, 15 tokens per class, and V = 14 unique words.
posterior_mean <- function(count, alpha = 0.5, class_total = 15, vocab = 14) {
  (count + alpha) / (class_total + vocab * alpha)
}
logit <- function(p) log(p / (1 - p)) # log odds of a probability
# Healthcare (Dem count = 2, Rep count = 1)
posterior_health_dem <- posterior_mean(2) # Posterior for Dem
posterior_health_rep <- posterior_mean(1) # Posterior for Rep
log_health_dem <- logit(posterior_health_dem)
log_health_rep <- logit(posterior_health_rep)
log_health_dem - log_health_rep # Log Odds Ratio
## [1] 0.560836
exp(log_health_dem - log_health_rep) # Exponentiated Log Odds Ratio
## [1] 1.752137
# Energy (Dem count = 2, Rep count = 2)
posterior_energy_dem <- posterior_mean(2) # Posterior for Dem
posterior_energy_rep <- posterior_mean(2) # Posterior for Rep
log_energy_dem <- logit(posterior_energy_dem)
log_energy_rep <- logit(posterior_energy_rep)
log_energy_dem - log_energy_rep # Log Odds Ratio
## [1] 0
exp(log_energy_dem - log_energy_rep) # Exponentiated Log Odds Ratio
## [1] 1
# Security (Dem count = 2, Rep count = 2)
posterior_security_dem <- posterior_mean(2) # Posterior for Dem
posterior_security_rep <- posterior_mean(2) # Posterior for Rep
log_security_dem <- logit(posterior_security_dem)
log_security_rep <- logit(posterior_security_rep)
log_security_dem - log_security_rep # Log Odds Ratio
## [1] 0
exp(log_security_dem - log_security_rep) # Exponentiated Log Odds Ratio
## [1] 1
# Scrape the Federalist Papers from Project Gutenberg and build a TF-IDF
# document-term matrix for the Madison-attributed essays.
# NOTE(review): requires rvest, dplyr/tibble, stringr, and tm, plus a
# network connection; reduce_complexity() is defined elsewhere in the
# project — presumably text normalization (confirm before reuse).
fed_papers <- read_html('https://www.gutenberg.org/cache/epub/18/pg18-images.html')
essays <- html_elements(fed_papers, '.chapter') # one HTML node per essay
text <- html_text2(essays) # plain text, one string per essay
text <- tibble(text)
federalist <- text %>%
filter(stringr::str_detect(text, 'slightly different version', negate = TRUE)) %>%
mutate(author = text %>%
str_extract('HAMILTON AND MADISON|HAMILTON OR MADISON|HAMILTON|MADISON|JAY') %>%
str_to_title(),
title = str_extract(text, 'No. [A-Z].*')) %>%
filter(str_detect(author, "Madison")) # Clean & Partition: keep every essay whose (title-cased) attribution contains "Madison", incl. "Hamilton And/Or Madison"
corpus <- VCorpus(VectorSource(sapply(federalist$text, reduce_complexity)))
dtm <- DocumentTermMatrix(corpus, control = list(weighting = weightTfIdf)) # TF-IDF DTM
dtm <- removeSparseTerms(dtm, 0.95) # (Optional) drop terms absent from >95% of docs to reduce sparsity
dtm_matrix <- as.matrix(dtm) # Convert DTM to a dense matrix for k-means
# Elbow search for k-means on the TF-IDF matrix: fit k in {5, 10, 15, 20, 25}
# and record the total within-cluster sum of squares for each k.
cluster_centers <- seq(5, 25, 5)
# Build one row per k with lapply and bind once at the end.
# (Fixed: the original grew a data.frame inside a 1:length(...) index loop
# via bind_rows — same result, but an O(n^2) growth anti-pattern.)
k_means_output <- do.call(rbind, lapply(cluster_centers, function(k) {
  fit <- kmeans(dtm_matrix, centers = k, nstart = 25) # nstart = 25 restarts for stability
  data.frame(cluster_center = k, sum_of_squares = fit$tot.withinss)
}))
# Marginal drop in WSS achieved by each successive k (first row has no predecessor).
k_means_output$reduction <- c(NA, diff(k_means_output$sum_of_squares) * -1)
colnames(k_means_output) <- c("Cluster Center", "Sum of Squares", "Reduction")
stargazer::stargazer(k_means_output,
                     summary = FALSE, # print the data frame itself, not summary stats
                     type = 'text')
##
## =========================================
## Cluster Center Sum of Squares Reduction
## -----------------------------------------
## 1 5 0.101
## 2 10 0.068 0.033
## 3 15 0.045 0.023
## 4 20 0.026 0.018
## 5 25 0.010 0.016
## -----------------------------------------
# Average marginal WSS reduction over two ranges of k (rows correspond to
# k = 5, 10, 15, 20, 25; row i's Reduction is the drop from the previous k).
mean(k_means_output$Reduction[2:3], na.rm = TRUE) # drops into k = 10 and 15, i.e., 5 to 15
## [1] 0.0280525
mean(k_means_output$Reduction[3:5], na.rm = TRUE) # drops into k = 15, 20, 25 — i.e., 10 to 25; NOTE(review): use [4:5] if "15 to 25" was intended
## [1] 0.01902038
A. For each topic, report the posterior probabilities for the top 10 documents (words) in each.
B. For at least 5 of the topics, write 2-3 sentences explaining what you think the “topic” identified by the model is – i.e., what do these words tell you about the substantive content of the pooled words? (3pt)
# Rebuild the corpus and fit a 10-topic LDA model on the Madison essays.
corpus <- VCorpus(VectorSource(sapply(federalist$text, reduce_complexity)))
dtm <- DocumentTermMatrix(corpus) # term-frequency DTM (default weighting; the earlier "TF-IDF" label was inaccurate — topicmodels::LDA expects raw counts)
lda_federalist <- LDA(dtm, k = 10, method = "Gibbs", control = list(seed = 1234)) # LDA w/ 10 topics & Gibbs sampling; fixed seed for reproducibility
topic_probs <- posterior(lda_federalist)$topics # per-document topic proportions (rows = docs, cols = topics)
print(round(topic_probs[c(1:10), c(1:5)], 2)) # first 10 documents x first 5 topics
## 1 2 3 4 5
## 1 0.02 0.04 0.01 0.09 0.24
## 2 0.18 0.07 0.08 0.08 0.28
## 3 0.08 0.02 0.51 0.02 0.07
## 4 0.08 0.02 0.45 0.06 0.11
## 5 0.12 0.06 0.29 0.08 0.13
## 6 0.04 0.24 0.03 0.03 0.29
## 7 0.13 0.02 0.06 0.04 0.21
## 8 0.01 0.01 0.01 0.05 0.28
## 9 0.04 0.02 0.02 0.02 0.21
## 10 0.33 0.02 0.03 0.07 0.24
topicmodels::terms(lda_federalist, 10) # Top 10 terms per topic
## Topic 1 Topic 2 Topic 3 Topic 4
## [1,] "general" "law" "member" "representative"
## [2,] "power" "election" "city" "will"
## [3,] "establishment" "within" "confederacy" "people"
## [4,] "force" "period" "war" "numb"
## [5,] "long" "circumstance" "empire" "time"
## [6,] "liberty" "knowledge" "among" "house"
## [7,] "necessary" "affair" "sovereign" "every"
## [8,] "army" "legislation" "union" "can"
## [9,] "necessity" "remark" "province" "elect"
## [10,] "dangerous" "practice" "foreign" "branch"
## Topic 5 Topic 6 Topic 7 Topic 8 Topic 9
## [1,] "state" "state" "interest" "convention" "department"
## [2,] "will" "power" "party" "congress" "executive"
## [3,] "may" "authority" "great" "power" "power"
## [4,] "government" "article" "right" "make" "legislative"
## [5,] "federal" "law" "find" "new" "constitution"
## [6,] "must" "shall" "public" "propose" "judiciary"
## [7,] "little" "constitution" "citizen" "confederation" "member"
## [8,] "one" "foreign" "man" "effect" "one"
## [9,] "good" "public" "two" "purpose" "constitutional"
## [10,] "case" "general" "nature" "constitution" "branch"
## Topic 10
## [1,] "people"
## [2,] "government"
## [3,] "national"
## [4,] "body"
## [5,] "senate"
## [6,] "character"
## [7,] "first"
## [8,] "authority"
## [9,] "example"
## [10,] "derive"