Note: Students should always aim to produce publication-worthy tables and figures. Unless otherwise stated, tables should be rendered using stargazer::(), while figures can be rendered using ggplot2::() or plot(). Regardless, tables and figures should always be presented with necessary formatting – e.g., (sub)title, axis (variable) labels and titles, a clearly-identifiable legend and key, etc. Problem sets must always be compiled using LaTeX or RMarkdown and include the full coding routine (with notes explaining your implementation) used to complete each problem (10pts).


  1. Using the sotu() library, recover the State of the Union addresses for Presidents Clinton and (W.) Bush between 1993-2008. Complete the following tasks:

    1. Select a policy sphere (e.g., Military, Economy, etc.). Using a for loop, select terms that you believe represent that policy sphere and partition the speeches. In some form, provide a table that prints the number of sentences recovered for each speech (2pts)

    2. Construct a function that reduces the complexity of your texts – including, but not limited to: lemmatizing and removing stopwords, punctuation, and numerals, etc. Afterwards, use a for loop to apply the function. Convert to a corpus DFM then print the top-20 most frequent words shared among the administrations’ speeches (2pts)

    3. Construct a word cloud graphic using textplot_wordcloud() (or similar) (1pt).

# 1.A

# Recover metadata rows for the Clinton and W. Bush administrations
# (comment previously mis-stated these as Eisenhower and H.W.)
sotu_info <- sotu::sotu_meta %>%
  filter(president %in% c('William J. Clinton', 'George W. Bush'))
indices <- sotu_info$X # Row Indices Used to Partition sotu_text

# Nest each speech in a named list: metadata row + full speech text,
# keyed as "President (Year)" for readable downstream printing
sotu_clinton_bush <- setNames(
  lapply(seq_len(nrow(sotu_info)), function(i) {
    cbind(sotu_info[i, ], text = sotu::sotu_text[[indices[i]]])
  }),
  paste0(sotu_info$president, " (", sotu_info$year, ")")
) # Nest Each Speech in List

# "Economy" policy-sphere terms (duplicate 'trade' removed — alternation
# behavior is unchanged), joined into a single grouped regex: (a|b|...)
economy_terms <- c('trade', 'recession', 'inflation', 'productivity',
                   'efficiency', 'markets', 'labor', 'industry', 'output')
economy_words_regex <- paste0('(', paste(economy_terms, collapse = '|'), ')')

for (speech in seq_along(sotu_clinton_bush)) {
  # Split the raw speech text on newlines (proxy for sentences) and
  # drop empty lines
  temp_speech <- sotu_clinton_bush[[speech]]
  temp_speech <- data.frame(stringr::str_split(temp_speech$text, pattern = '\\n')) %>%
    setNames('text') %>%
    filter(text != '') # Grab Speech -- Partition to Sentences

  sotu_clinton_bush[[speech]]$text <- list(temp_speech) # Append Back to Original

  # Keep only sentences containing at least one "economy" term
  economy_sentences <- temp_speech %>%
    filter(grepl(economy_words_regex, text, ignore.case = TRUE))

  sotu_clinton_bush[[speech]]$economy_text <- list(economy_sentences) # Append

} # Process Speeches & Isolate "economy" Sentences

# Print the number of "economy" sentences recovered per speech
for (speech in seq_along(sotu_clinton_bush)) {
  temp_speech_name <- names(sotu_clinton_bush[speech])
  n_economy <- length(unlist(sotu_clinton_bush[[speech]]$economy_text))
  cat(temp_speech_name, ' -- ', n_economy, ' Sentences \n')
} # Prints # of "economy" Sentences Per Speech
## William J. Clinton (1993)  --  10  Sentences 
## William J. Clinton (1994)  --  6  Sentences 
## William J. Clinton (1995)  --  6  Sentences 
## William J. Clinton (1996)  --  5  Sentences 
## William J. Clinton (1997)  --  6  Sentences 
## William J. Clinton (1998)  --  10  Sentences 
## William J. Clinton (1999)  --  12  Sentences 
## William J. Clinton (2000)  --  12  Sentences 
## George W. Bush (2001)  --  4  Sentences 
## George W. Bush (2002)  --  6  Sentences 
## George W. Bush (2003)  --  5  Sentences 
## George W. Bush (2004)  --  5  Sentences 
## George W. Bush (2005)  --  2  Sentences 
## George W. Bush (2006)  --  3  Sentences 
## George W. Bush (2007)  --  2  Sentences 
## George W. Bush (2008)  --  4  Sentences
# 1.B

# Combine every speech's "economy" sentences into one long data frame.
# Preallocate a list and bind once at the end instead of growing a
# data.frame inside the loop (avoids quadratic copying).
economy_list <- vector("list", length(sotu_clinton_bush))

for (i in seq_along(sotu_clinton_bush)) {
  temp_economy <- unlist(sotu_clinton_bush[[i]]$economy_text)
  if (length(temp_economy) == 0) {
    next # Skip speeches with no matching sentences
  }
  economy_list[[i]] <- data.frame(speech = names(sotu_clinton_bush[i]),
                                  economy_text = temp_economy)
} # Collect Per-Speech DFs

economy_speeches <- bind_rows(economy_list) # Combine to Single DF (NULLs dropped)

# Identify the administration from the speech label
economy_speeches$president <- ifelse(grepl("Clinton", economy_speeches$speech),
                                     "Clinton", "Bush") # Add President ID


# Reduce a text string to a normalized bag-of-words form: lower-case,
# strip punctuation/numbers/stopwords, lemmatize each token, then
# rejoin into a single whitespace-normalized string.
reduce_complexity <- function(text) {
  text <- tolower(text)                                   # Lower Case
  text <- tm::removePunctuation(text)                     # Punctuation
  text <- tm::removeNumbers(text)                         # Numbers
  text <- tm::removeWords(text, tm::stopwords("english")) # Stop Words (namespaced for consistency)
  text <- unlist(stringr::str_split(text, '\\s+'))        # Tokenize
  text <- textstem::lemmatize_words(text)                 # Lemmatize
  text <- paste(text, collapse = ' ')                     # Re-Append
  text <- gsub("\\s{2,}", ' ', text)                      # Collapse Repeated Spaces
  trimws(text)                                            # Trim Leading/Trailing Whitespace
} # Function to Process Text for Bag of Words


# Apply the complexity-reduction function to every sentence.
# vapply() over sapply(): type-stable (always character) in scripted code.
economy_speeches <- economy_speeches %>%
  mutate(economy_text_clean = vapply(economy_text, reduce_complexity,
                                     character(1))) # Apply Complexity Reduction

sotu_corpus <- quanteda::corpus(economy_speeches,
                                text_field = "economy_text_clean") # Convert to Corpus Object

sotu_tokens <- quanteda::tokens(sotu_corpus) # Recover Tokens from Corpus Object

# Build the document-feature matrix; drop terms appearing fewer than twice
sotu_dfm <- quanteda::dfm(sotu_tokens) %>%
  quanteda::dfm_trim(min_termfreq = 2)

quanteda::topfeatures(sotu_dfm, 20) # Top-20 Features (Words)
##    trade     will      new     year american     good      job  america 
##       67       64       64       55       49       48       45       45 
##     must   market     work    world  economy   people    every       us 
##       43       43       38       35       33       30       27       24 
##      can      way   worker   create 
##       24       23       23       23
# 1. C

# Group the DFM by administration for the comparison cloud
president_dfm <- quanteda::dfm_group(sotu_dfm,
                                     groups = economy_speeches$president)

# Top words per president — printed explicitly so the table is actually
# rendered (it was previously assigned but never displayed)
top_words_president <- quanteda.textstats::textstat_frequency(
  sotu_dfm, n = 15, groups = economy_speeches$president
) # Top Words by Admin
print(top_words_president)

# Comparison word cloud: Clinton vs. Bush economy vocabulary
quanteda.textplots::textplot_wordcloud(president_dfm, comparison = TRUE,
                                       max_words = 100,
                                       color = c("blue", "red"))


  1. Complete the same tasks with two document groups of your choice. The only requirements are that you must have (at minimum) 20 documents and 2 groups, and you cannot use State of the Union addresses (5pts)