Note: Students should always aim to produce
publication-worthy tables and figures. Unless otherwise stated,
tables should be rendered using stargazer::(), while
figures can be rendered using ggplot2::() or
plot(). Regardless, tables and figures should always be
presented with necessary formatting – e.g., (sub)title, axis (variable)
labels and titles, a clearly-identifiable legend and key, etc. Problem
sets must always be compiled using LaTeX or
RMarkdown and include the full coding routine (with notes
explaining your implementation) used to complete each problem
(10pts).
Using the sotu library, recover the State of the
Union addresses for Presidents Clinton and (W.) Bush between 1993-2008.
Complete the following tasks:
Select a policy sphere (e.g., Military,
Economy, etc.). Using a for loop, select terms
that you believe represent that policy sphere and partition the
speeches. In some form, provide a table that prints the number of
sentences recovered for each speech (2pts)
Construct a function that reduces the complexity of
your texts – including, but not limited to: lemmatizing and removing
stopwords, punctuation, and numerals, etc. Afterwards, use a
for loop to apply the function. Convert to a corpus DFM
then print the top-20 most frequent words shared among the
administrations’ speeches (2pts)
Construct a word cloud graphic using
textplot_wordcloud() (or similar) (1pt).
# 1.A
# Filter SOTU metadata to the Clinton and W. Bush administrations (1993-2008)
sotu_info <- sotu::sotu_meta %>%
  filter(president %in% c('William J. Clinton', 'George W. Bush')) # Get Info for Clinton and W. Bush
indices <- sotu_info$X # Row Indices Used to Partition sotu_text (already a vector; no c() needed)
# Nest each speech (one row of metadata + its raw text) as one element of a
# named list keyed "President (Year)"
sotu_clinton_bush <- setNames(
  lapply(seq_len(nrow(sotu_info)), function(i) {
    cbind(sotu_info[i, ], text = sotu::sotu_text[[indices[i]]])
  }),
  paste0(sotu_info$president, " (", sotu_info$year, ")")
) # Nest Each Speech in List
# Terms taken to represent the "Economy" policy sphere ('trade' listed once;
# the original list duplicated it, which is harmless but redundant in a regex)
economy_terms <- c('trade', 'recession', 'inflation', 'productivity',
                   'efficiency', 'markets', 'labor', 'industry', 'output')
economy_words_regex <- paste0('(', paste(economy_terms, collapse = '|'), ')') # "Economy" Words Regex
# Process each speech: split the raw text into sentences (newline-delimited in
# the sotu texts), keep non-empty lines, then isolate the subset of sentences
# containing at least one "economy" term. Both versions are stored back on the
# per-speech list element (as `text` and `economy_text`).
for (speech in seq_along(sotu_clinton_bush)) {
  temp_speech <- sotu_clinton_bush[[speech]]
  temp_speech <- data.frame(stringr::str_split(temp_speech$text, pattern = '\\n')) %>%
    setNames('text') %>%
    filter(text != '') # Grab Speech -- Partition to Sentences, Drop Empty Lines
  sotu_clinton_bush[[speech]]$text <- list(temp_speech) # Append Back to Original
  economy_sentences <- temp_speech %>%
    filter(grepl(economy_words_regex, text, ignore.case = TRUE)) # All Sentences w/ "economy" Words
  sotu_clinton_bush[[speech]]$economy_text <- list(economy_sentences) # Append
} # Process Speeches & Isolate "economy" Sentences
# Report the number of "economy" sentences recovered per speech.
# unlist() flattens the stored one-column data frame, so its length equals
# the number of matching sentences.
for (speech in seq_along(sotu_clinton_bush)) {
  temp_speech_name <- names(sotu_clinton_bush[speech])
  economy_sentences <- length(unlist(sotu_clinton_bush[[speech]]$economy_text))
  cat(temp_speech_name, ' -- ', economy_sentences, ' Sentences \n')
} # Prints # of "economy" Sentences Per Speech
## William J. Clinton (1993) -- 10 Sentences
## William J. Clinton (1994) -- 6 Sentences
## William J. Clinton (1995) -- 6 Sentences
## William J. Clinton (1996) -- 5 Sentences
## William J. Clinton (1997) -- 6 Sentences
## William J. Clinton (1998) -- 10 Sentences
## William J. Clinton (1999) -- 12 Sentences
## William J. Clinton (2000) -- 12 Sentences
## George W. Bush (2001) -- 4 Sentences
## George W. Bush (2002) -- 6 Sentences
## George W. Bush (2003) -- 5 Sentences
## George W. Bush (2004) -- 5 Sentences
## George W. Bush (2005) -- 2 Sentences
## George W. Bush (2006) -- 3 Sentences
## George W. Bush (2007) -- 2 Sentences
## George W. Bush (2008) -- 4 Sentences
# 1.B
# Flatten the per-speech "economy" sentences into one long data frame
# (one row per sentence, tagged with its source speech). Built as a list of
# data frames combined in a single bind_rows() call, rather than growing a
# data frame inside a loop (which copies the accumulator on every iteration).
economy_speeches <- bind_rows(lapply(seq_along(sotu_clinton_bush), function(i) {
  temp_economy <- unlist(sotu_clinton_bush[[i]]$economy_text)
  if (length(temp_economy) == 0) {
    return(NULL) # Skip Speeches w/ No Matching Sentences
  }
  data.frame(speech = names(sotu_clinton_bush[i]),
             economy_text = temp_economy)
})) # Combine to Single DF
# The list names embed the president's surname, so a simple pattern match
# recovers the administration for each sentence
economy_speeches$president <- ifelse(grepl("Clinton", economy_speeches$speech),
                                     "Clinton", "Bush") # Add President ID
# Reduce the complexity of a text for bag-of-words analysis: lower-case,
# strip punctuation/numerals/English stopwords, lemmatize each token, and
# normalize whitespace. Takes a character string and returns the cleaned
# string. (Intended to be applied one sentence at a time -- a multi-element
# input would be collapsed into a single string by the paste() step.)
reduce_complexity <- function(text) {
  text <- tolower(text) # Lower Case
  text <- tm::removePunctuation(text) # Punctuation
  text <- tm::removeNumbers(text) # Numbers
  # Namespaced (tm::) for consistency with the calls above; the bare
  # removeWords() would fail if tm were not attached
  text <- tm::removeWords(text, tm::stopwords("english")) # Stop Words
  text <- unlist(stringr::str_split(text, '\\s+')) # Tokenize on Whitespace
  text <- textstem::lemmatize_words(text) # Lemmatize Each Token
  text <- paste(text, collapse = ' ') # Re-Append to Single String
  text <- gsub("\\s{2,}", ' ', text) # 2 or More Spaces --> One Space
  trimws(text) # Trim Leading/Trailing White Space
} # Function to Process Text for Bag of Words
# Apply the complexity-reduction function to every sentence. vapply() (rather
# than sapply()) guarantees a character vector even on degenerate input.
economy_speeches <- economy_speeches %>%
  mutate(economy_text_clean = vapply(economy_text, reduce_complexity,
                                     character(1))) # Apply Complexity Reduction
# Corpus -> tokens -> document-feature matrix; quanteda:: namespacing is
# applied throughout for consistency with the corpus()/tokens() calls
sotu_corpus <- quanteda::corpus(economy_speeches, text_field = "economy_text_clean") # Convert to Corpus Object
sotu_tokens <- quanteda::tokens(sotu_corpus) # Recover Tokens from Corpus Object
sotu_dfm <- quanteda::dfm(sotu_tokens) %>%
  quanteda::dfm_trim(min_termfreq = 2) # Convert to DFM -- Remove Words w/ Less Than 2 Appearances
quanteda::topfeatures(sotu_dfm, 20) # 20-top Features (Words)
## trade will new year american good job america
## 67 64 64 55 49 48 45 45
## must market work world economy people every us
## 43 43 38 35 33 30 27 24
## can way worker create
## 24 23 23 23
# 1.C
# Collapse the sentence-level DFM into two documents (one per president) so
# the comparison cloud contrasts the administrations' vocabularies
president_dfm <- quanteda::dfm_group(sotu_dfm, groups = economy_speeches$president) # Group DFM by President
# Top words per president (computed for inspection; not used by the plot below)
top_words_president <- quanteda.textstats::textstat_frequency(sotu_dfm, n = 15,
                                                              groups = economy_speeches$president) # Top Words by Admin
# NOTE(review): colors map to groups in the DFM's document order -- presumably
# alphabetical ("Bush" blue, "Clinton" red); confirm against the rendered plot
quanteda.textplots::textplot_wordcloud(president_dfm, comparison = TRUE, max_words = 100,
                                       color = c("blue", "red"))