Note: Students should always aim to produce publication-worthy tables and figures. Unless otherwise stated, tables should be rendered with the stargazer package, while figures can be rendered with ggplot2 or base plot(). Regardless, tables and figures should always be presented with the necessary formatting – e.g., a (sub)title, axis (variable) labels and titles, a clearly identifiable legend and key, etc. Problem sets must always be compiled using LaTeX or RMarkdown and include the full coding routine (with notes explaining your implementation) used to complete each problem (10pts).


  1. The sample data below represent terms recovered from American party platforms, where each row represents one platform document, its party label, and three terms drawn from similar policy areas (4pt).
Document_ID   Category     Terms
1             Democrat     healthcare, taxes, security
2             Democrat     climate, energy, security
3             Democrat     healthcare, equity, border
4             Democrat     immigration, pathway, energy
5             Democrat     taxes, diplomacy, defense
6             Republican   taxes, energy, security
7             Republican   border, enforcement, energy
8             Republican   healthcare, market, defense
9             Republican   taxes, domestic, security
10            Republican   immigration, enforcement, defense
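One way to load the sample data for analysis is to store it in a tidy data frame; this is a minimal sketch (the object name `platforms` and the comma-separated storage of terms are assumptions, not requirements):

```r
# Hypothetical construction of the sample data as an R data frame
platforms <- data.frame(
  doc_id   = 1:10,
  category = rep(c("Democrat", "Republican"), each = 5),
  terms    = c("healthcare, taxes, security",
               "climate, energy, security",
               "healthcare, equity, border",
               "immigration, pathway, energy",
               "taxes, diplomacy, defense",
               "taxes, energy, security",
               "border, enforcement, energy",
               "healthcare, market, defense",
               "taxes, domestic, security",
               "immigration, enforcement, defense"),
  stringsAsFactors = FALSE
)
```

The `terms` column can then be split with strsplit() (or tokenized with a package such as quanteda) for the calculations below.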

A. For two words found in both classes (Democratic and Republican), calculate the mutual information between the word and the class label. Several words appear in both classes; you only need to choose two.
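As a starting point, mutual information for a binary word-presence variable can be computed directly from the 2x2 table of document counts. This is a hedged sketch, not required code; the function name and the illustrative counts (taken from the table for "healthcare": present in 2 Democratic and 1 Republican document, with 10 documents total) are my own:

```r
# Mutual information (in bits) between word presence and party class,
# from the four cells of a 2x2 document-count table
mi_word <- function(n11, n10, n01, n00) {
  # n11: word present & Democrat;  n10: word present & Republican
  # n01: word absent  & Democrat;  n00: word absent  & Republican
  N  <- n11 + n10 + n01 + n00
  p  <- c(n11, n10, n01, n00) / N                               # joint P(w, c)
  pw <- c(p[1] + p[2], p[1] + p[2], p[3] + p[4], p[3] + p[4])   # marginal P(w)
  pc <- c(p[1] + p[3], p[2] + p[4], p[1] + p[3], p[2] + p[4])   # marginal P(c)
  sum(ifelse(p > 0, p * log2(p / (pw * pc)), 0))
}

mi_word(2, 1, 3, 4)  # illustrative counts for "healthcare"
```

The ifelse() guard skips empty cells, since 0 * log(0) is defined as 0 in the mutual-information sum.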

B. For the words healthcare, energy, and security, use the Fightin' Words approach to recover the log-odds ratio indicating how strongly each word predicts one class versus the other.
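A minimal sketch of the Fightin' Words calculation (Monroe, Colaresi & Quinn 2008), assuming word counts tallied from the table above; the function name, the prior values, and the illustrative call are assumptions:

```r
# Log-odds ratio with an informative Dirichlet prior, standardized by its
# approximate variance, as in the Fightin' Words approach
fw_logodds <- function(y_dem, n_dem, y_rep, n_rep, alpha = 0.01, alpha0 = 0.3) {
  # y_*: count of the word in each class; n_*: total tokens in each class
  # alpha: prior count for this word;     alpha0: total prior mass
  l_dem <- log((y_dem + alpha) / (n_dem + alpha0 - y_dem - alpha))
  l_rep <- log((y_rep + alpha) / (n_rep + alpha0 - y_rep - alpha))
  delta <- l_dem - l_rep                       # log-odds ratio
  v     <- 1 / (y_dem + alpha) + 1 / (y_rep + alpha)  # approximate variance
  delta / sqrt(v)                              # z-score; positive -> Democratic
}

# e.g., "healthcare": 2 of 15 Democratic tokens, 1 of 15 Republican tokens
fw_logodds(2, 15, 1, 15)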


  2. Using the Federalist Papers authored by James Madison (no co-authors or disputed authorship), apply TF-IDF weighting and run k-means clustering with 5, 10, 15, 20, and 25 cluster centers. Report the within-cluster sum of squares for each. What is the average reduction in the sum of squares per additional cluster center? Is the answer the same if this question is asked for the range 5–15 versus 15–25? (3pt)
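One possible workflow uses quanteda for weighting and base R kmeans(); this is a sketch under the assumption that the Madison-only texts are already subset into a character vector `madison_texts` (how you obtain and filter the Federalist Papers is up to you):

```r
# TF-IDF weighting and k-means over several values of k (sketch)
library(quanteda)

dfm_mat <- tokens(madison_texts, remove_punct = TRUE) |>
  dfm() |>
  dfm_remove(stopwords("en")) |>
  dfm_tfidf()

set.seed(1234)  # k-means depends on random starting values
ks  <- c(5, 10, 15, 20, 25)
wss <- sapply(ks, function(k) {
  kmeans(as.matrix(dfm_mat), centers = k, nstart = 25)$tot.withinss
})

# average reduction in within-cluster SS per added center, e.g. over 5 -> 25
(wss[1] - wss[5]) / (25 - 5)
```

The same ratio computed over wss[1]–wss[3] (k = 5 to 15) and wss[3]–wss[5] (k = 15 to 25) answers the second part of the question.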

  3. Using the same Federalist Papers data (Madison authorship only), construct an LDA topic model with 10 topics. Print the top-10 words associated with each of those topics.

A. For each topic, report the posterior probabilities of the top-10 words in that topic.

B. For at least 5 of the topics, write 2-3 sentences explaining what you think the "topic" identified by the model is – i.e., what do the grouped words tell you about the substantive content of that topic? (3pt)
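A sketch of the LDA steps using the topicmodels package, assuming a raw-count document-feature matrix `dfm_counts` of the Madison papers has already been built (LDA requires unweighted counts, not the TF-IDF matrix from the previous problem):

```r
# LDA with 10 topics, top words, and their posterior probabilities (sketch)
library(quanteda)
library(topicmodels)

dtm     <- convert(dfm_counts, to = "topicmodels")   # quanteda dfm -> DTM
lda_fit <- LDA(dtm, k = 10, control = list(seed = 1234))

terms(lda_fit, 10)                 # top-10 words for each topic

beta <- posterior(lda_fit)$terms   # topic-by-word posterior probabilities
top_probs <- apply(beta, 1, function(p) sort(p, decreasing = TRUE)[1:10])
top_probs                          # posterior probabilities of each topic's top-10 words
```

Setting the seed in control matters here because LDA's variational/Gibbs initialization is stochastic, so topic labels and orderings will differ across runs otherwise.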