Note: Students should always aim to produce
publication-worthy tables and figures. Unless otherwise stated,
tables should be rendered using stargazer::stargazer(), while
figures can be rendered using ggplot2::ggplot() or
plot(). Regardless, tables and figures should always be
presented with the necessary formatting – e.g., a (sub)title, axis (variable)
labels and titles, a clearly identifiable legend and key, etc. Problem
sets must always be compiled using LaTeX or
RMarkdown and include the full coding routine (with notes
explaining your implementation) used to complete each problem
(10 pts).
Using the gutenbergr package, recover the text for H.G. Wells’
The War of the Worlds (gutenberg_id = 36).
Report the total number of words and the number of unique words found in each chapter
(3 pts).
# Download The War of the Worlds (Project Gutenberg id 36).
war <- gutenbergr::gutenberg_download(36)

# Candidate chapter markers: the Roman numerals I-XXVI, each followed by a
# period ("I.", "II.", ...). A row whose text is exactly one of these is
# treated as a chapter start; the row after it is treated as the title.
chapter_ids <- paste0(as.character(as.roman(1:26)), ".")

# Label every line of body text with the chapter it belongs to.
war <- war %>%
  mutate(
    is_chapter_start = text %in% chapter_ids,
    # Chapter name = numeral row + the title row that follows it.
    chapter_name = ifelse(is_chapter_start, paste(text, lead(text)), NA)
  ) %>%
  # Drop the title row that follows each numeral row. `default = FALSE` keeps
  # the test NA-free on the first row instead of relying on filter() silently
  # discarding an NA comparison.
  filter(!lag(is_chapter_start, default = FALSE)) %>%
  # Carry each chapter name down over the lines that follow it.
  tidyr::fill(chapter_name, .direction = "down") %>%
  # Remove front matter (no chapter yet), blank lines, and the numeral rows.
  filter(!is.na(chapter_name), !text %in% c("", " "), !is_chapter_start) %>%
  select(-c(is_chapter_start, gutenberg_id))

# Tokenise each chapter into words, then count total and distinct words.
# NOTE(review): summarize() returns groups sorted by chapter_name, so the
# rendered table is alphabetical by numeral string (e.g. "IX." precedes "V.")
# and chapters sharing a numeral sit next to each other, rather than following
# the order in which they appear in the book.
war <- war %>%
  group_by(chapter_name) %>%
  summarize(text = paste(text, collapse = " ")) %>%
  tidytext::unnest_tokens(word, text) %>%
  group_by(chapter_name) %>%
  summarize(
    word_count = n(),
    unique_words = n_distinct(word)
  ) %>%
  rename(
    "Chapter Name" = chapter_name,
    "Word Count" = word_count,
    "Unique Words" = unique_words
  )

# Print the raw table (summary = FALSE) rather than summary statistics.
stargazer::stargazer(war, type = "text", summary = FALSE)
##
## ==========================================================================================
## Chapter Name Word Count Unique Words
## ------------------------------------------------------------------------------------------
## 1 I. THE EVE OF THE WAR. 2227 828
## 2 I. UNDER FOOT. 2607 878
## 3 II. THE FALLING STAR. 1341 522
## 4 II. WHAT WE SAW FROM THE RUINED HOUSE. 3320 1157
## 5 III. ON HORSELL COMMON. 1019 461
## 6 III. THE DAYS OF IMPRISONMENT. 1958 719
## 7 IV. THE CYLINDER OPENS. 1125 475
## 8 IV. THE DEATH OF THE CURATE. 1612 604
## 9 IX. THE FIGHTING BEGINS. 2126 753
## 10 IX. WRECKAGE. 1847 727
## 11 V. THE HEAT-RAY. 1488 555
## 12 V. THE STILLNESS. 965 428
## 13 VI. THE HEAT-RAY IN THE CHOBHAM ROAD. 834 419
## 14 VI. THE WORK OF FIFTEEN DAYS. 1253 530
## 15 VII. HOW I REACHED HOME. 1317 543
## 16 VII. THE MAN ON PUTNEY HILL. 5590 1514
## 17 VIII. DEAD LONDON. 3104 973
## 18 VIII. FRIDAY NIGHT. 980 452
## 19 X. IN THE STORM. 2256 763
## 20 X. THE EPILOGUE. 1378 575
## 21 XI. AT THE WINDOW. 2032 706
## 22 XII. WHAT I SAW OF THE DESTRUCTION OF WEYBRIDGE AND SHEPPERTON. 3863 1183
## 23 XIII. HOW I FELL IN WITH THE CURATE. 1781 703
## 24 XIV. IN LONDON. 3756 1207
## 25 XV. WHAT HAD HAPPENED IN SURREY. 2820 974
## 26 XVI. THE EXODUS FROM LONDON. 4407 1240
## 27 XVII. THE “THUNDER CHILD”. 3297 1100
## ------------------------------------------------------------------------------------------
# Location of the plain-text script read below.
west_wing_script_location <- "https://raw.githubusercontent.com/JakeTruscott/CSS_POS_UF/main/docs/assets/replication_materials/class_4/supplemental_materials/West_Wing_S2_E22.txt"
west_wing <- readLines(west_wing_script_location, warn = FALSE) # One element per script line

# Distinct non-empty lines containing no lowercase letters.
# Not used by the pipeline below; retained so any later code that refers to
# `character_stage` still finds it.
character_stage <- unique(west_wing[grepl("^[^a-z]*$", west_wing)])
character_stage <- character_stage[!character_stage == ""]

# Speakers to track. A speaker cue is a line consisting solely of one of these
# names in upper case. The periods in "C.J." are wrapped in a character class
# so they match a literal "." rather than any character.
characters <- c("Leo", "Toby", "Josh", "C.J.", "Sam", "Charlie", "Bartlet")
character_regex <- paste0(
  "^(",
  paste0(gsub(".", "[.]", toupper(characters), fixed = TRUE), collapse = "|"),
  ")$"
)

# Build one row per spoken block: speaker, dialogue text, running id, word count.
west_wing <- data.frame(text = west_wing) %>%
  mutate(
    character_line = stringr::str_detect(text, character_regex),
    empty_row = text == "",
    # Running index of speaker cues; 0 means "before the first tracked cue".
    id = cumsum(character_line)
  ) %>%
  filter(id > 0) %>%
  group_by(id) %>%
  # Within each block keep only the rows before the first empty line, i.e. the
  # cue itself plus the lines that directly follow it.
  filter(cumsum(empty_row) == 0) %>%
  summarise(
    character = text[1],                        # the cue row opens every block
    dialogue = paste(text[-1], collapse = " "), # everything after the cue
    .groups = "drop"
  ) %>%
  mutate(
    dialogue = trimws(gsub("\\[.*?\\]", "", dialogue)), # Remove Hard Brackets
    word_count = stringr::str_count(dialogue, "\\S+")   # whitespace-delimited words
  ) %>%
  # Drop blocks left with no words.
  filter(word_count != 0) %>%
  select(character, dialogue, id, word_count)

# Per-character totals, ordered from most to fewest words spoken.
two_cathedrals <- west_wing %>%
  group_by(character) %>%
  summarise(
    total_words = sum(word_count),
    average_words = round(mean(word_count)),
    total_lines = n()
  ) %>%
  arrange(desc(total_words)) %>%
  rename(
    Character = character,
    `Total Words` = total_words,
    `Average Words` = average_words,
    `Total Lines` = total_lines
  )

# Print the raw table (summary = FALSE) rather than summary statistics.
stargazer::stargazer(two_cathedrals, type = "text", summary = FALSE)
##
## =================================================
## Character Total Words Average Words Total Lines
## -------------------------------------------------
## 1 BARTLET 735 9 78
## 2 C.J. 638 13 51
## 3 TOBY 283 7 41
## 4 LEO 223 6 39
## 5 JOSH 217 9 23
## 6 SAM 214 10 22
## 7 CHARLIE 146 8 19
## -------------------------------------------------
Convert the dialogue data into a corpus object using quanteda. Make sure you also assign the necessary
metadata – specifically the character (speaker) and the dialogue id numbers
indicating (i) the order in which this dialogue appears in the
episode, as well as (ii) the order of this dialogue among that
particular character's lines. Be sure to render a summary of the corpus object
using summary(corpus_object) (3 pts).
# Attach two ordering ids to every block of dialogue:
#   episode_dialogue_id   - the id already carried by `west_wing`
#   character_dialogue_id - running count within each character
two_cathedrals <- west_wing %>%
  group_by(character) %>%
  mutate(character_dialogue_id = row_number()) %>%
  ungroup() %>% # hand quanteda a plain (ungrouped) table
  rename(episode_dialogue_id = id) %>%
  select(character, dialogue, character_dialogue_id, episode_dialogue_id)

# `dialogue` becomes the document text; the remaining columns become docvars.
two_cathedrals_corpus <- quanteda::corpus(two_cathedrals, text_field = "dialogue")
summary(two_cathedrals_corpus[1:10]) # Print Summary (first 10 documents)
## Corpus consisting of 10 documents, showing 10 documents:
##
## Text Types Tokens Sentences character character_dialogue_id
## text1 2 2 1 LEO 1
## text2 2 2 1 LEO 2
## text3 2 2 1 LEO 3
## text4 10 10 1 LEO 4
## text5 12 14 1 LEO 5
## text6 14 16 2 LEO 6
## text7 12 12 1 TOBY 1
## text8 27 30 3 TOBY 2
## text9 22 23 2 TOBY 3
## text10 2 2 1 SAM 1
## episode_dialogue_id
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10