rm(list = ls(all = TRUE))
getwd()
# set here the folder where you have saved the datasets you are going to use in this class
# setwd("*****")
# in my case:
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(cowplot)

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)
testCorpus <- corpus(myText, docid_field = "doc_id")
summary(testCorpus)
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus), fixed = TRUE)
summary(testCorpus)

# Comparing the results with and without stopwords, and with and without stemming, is always good practice.
# For example, because Wordfish estimates word fixed effects, removing stopwords is less critical
# than it is for other scaling methods
# (a hedged sketch implementing this check appears after the "dir" comparisons below).
myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE, stem = TRUE,
             remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20) # 20 top words

#########################################################################
#########################################################################
# Using wordfish
#########################################################################
#########################################################################

# READ the pdf file!
# "dir" indicates which two documents are used for global identification purposes
# (the first document is placed to the left of the second one). This usually matters more for the
# interpretation of the results (i.e., for the direction of the scores along the latent dimension:
# which ones are positive, which ones negative) than for the estimation per se.

# here: LAB 92 to the left of CONS 92
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)
str(wfm)

# here: CONS 92 to the left of LAB 92
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# compare with the previous case!
summary(wfm)
summary(wfm2)
cor(wfm$theta, wfm2$theta)

# and what if now LIB 97 is to the left of CONS 97?
summary(testCorpus)
wfm3 <- textmodel_wordfish(myDfm, dir = c(6, 2))
summary(wfm3)

# compare with the first case:
cor(wfm$theta, wfm3$theta)
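# A minimal sketch (not part of the original script; myDfm_raw and wfm_raw are
# illustrative names) of the preprocessing robustness check suggested above:
# re-estimate the model on a dfm that keeps stopwords and unstemmed word forms,
# then correlate the document positions with those obtained from myDfm.
myDfm_raw <- dfm(testCorpus, tolower = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
wfm_raw <- textmodel_wordfish(myDfm_raw, dir = c(3, 1))
cor(wfm$theta, wfm_raw$theta) # a high correlation suggests robustness to preprocessing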
# Always run diagnostics!
# A good start for diagnostics is the analysis of the word discrimination parameters.
# Weights with large values mean that these words are estimated to be at the extremes of the dimension.

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("government", "global", "children", "bank", "economy",
                                 "citizenship", "productivity", "deficit", "april"),
                 highlighted_color = "red")

# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents", groups = docvars(testCorpus, "Year"))

# extract the estimates of the model and save them
str(wfm)
words <- wfm$features
beta <- wfm$beta
psi <- wfm$psi
scores_words <- data.frame(words, beta, psi)
str(scores_words)
head(scores_words)

# show the 10 words with the largest positive beta values
step <- order(-scores_words$beta)
scores_words <- scores_words[step, ]
head(scores_words, 10)
# show the 10 words with the largest negative beta values
tail(scores_words, 10)

# save the results in a csv file
write.csv(scores_words, "result_wordfish_words.csv")

party <- wfm$docs
theta <- wfm$theta
se.theta <- wfm$se.theta
scores_texts <- data.frame(party, theta, se.theta)
str(scores_texts)
# 95% confidence intervals: lower = theta - 1.96*se, upper = theta + 1.96*se
scores_texts$lower <- scores_texts$theta - 1.96 * scores_texts$se.theta
scores_texts$upper <- scores_texts$theta + 1.96 * scores_texts$se.theta
str(scores_texts)

# save the results in a csv file
write.csv(scores_texts, "result_wordfish_texts.csv")

#########################################################################
#########################################################################
# Let's compare the results we get from Wordfish with the raw score ones
# we get from Wordscores using the economic policy positions
#########################################################################
#########################################################################

# set the reference scores
ws <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws)
pr_all <- predict(ws, se.fit = TRUE)
pr_all
doclab <- docnames(testCorpus)
doclab

# Comparing wordscores vs wordfish
wordscores <- textplot_scale1d(pr_all, margin = "documents", doclabels = doclab)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores, wordfish, labels = c('Wordscores', 'Wordfish'))

# insights: a) the parties move in the same direction between 1992 and 1997: that's a good thing!;
# b) the Conservatives hold the same relative position, but the other two parties do not:
# perhaps what we are measuring with Wordfish goes beyond "economic policy" issues?
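# A minimal sketch (not part of the original script; theta_z and ws_z are illustrative
# names) for reading the two panels above on a common scale: Wordfish thetas are
# standardized by the identification constraints, while Wordscores raw scores live on
# the metric of the reference values, so standardize both before placing them side by side.
theta_z <- as.numeric(scale(wfm$theta))
ws_z <- as.numeric(scale(pr_all$fit))
data.frame(doc = doclab, wordfish = round(theta_z, 2), wordscores = round(ws_z, 2))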
# check for the correlation
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit
scores_texts <- data.frame(party, score_wf, score_ws)
str(scores_texts)
cor(scores_texts$score_ws, scores_texts$score_wf)

# you can also draw a scatterplot, with a fit line and party names
plot(scores_texts$score_ws, scores_texts$score_wf, main = "Scatterplot",
     xlab = "Wordscores", ylab = "Wordfish", pch = 19)
text(scores_texts$score_ws, scores_texts$score_wf, labels = scores_texts$party,
     pos = 4, col = "royalblue", cex = 0.8)
abline(lm(scores_texts$score_wf ~ scores_texts$score_ws), col = "red") # regression line (y~x)

#########################################################################
#########################################################################
# Using wordfish: US Presidential Inaugural Speeches after 1980
#########################################################################
#########################################################################

# Apply wordfish by first placing Reagan 1981 to the right of Obama 2009,
# and then Trump 2017 to the right of Obama 2009: any change?
# Compare these results to what you got with wordscores when
# Obama 2009 = -1 and either Reagan 1981 = 1 OR Trump 2017 = 1
# (a sketch at the end of this section shows how to look up the "dir" indices by docname).

# create a dfm from the inaugural addresses from Reagan onwards
presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980),
               remove = stopwords("english"), stem = TRUE, remove_punct = TRUE)
presDfm[, 1:5]

# Obama 2009 to the left of Reagan 1981
wfm <- textmodel_wordfish(presDfm, dir = c(8, 1))
summary(wfm)
# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

# Obama 2009 to the left of Trump 2017
wfm2 <- textmodel_wordfish(presDfm, dir = c(8, 10))
summary(wfm2)
# Plot estimated document positions
textplot_scale1d(wfm2, margin = "documents")

# Comparing wordfish with Reagan or Trump in "dir"
# library(cowplot)
reagan <- textplot_scale1d(wfm, margin = "documents")
trump <- textplot_scale1d(wfm2, margin = "documents")
plot_grid(reagan, trump, labels = c('Reagan=on the right of Obama', 'Trump=on the right of Obama'))

# check for the correlation
score_reagan <- wfm$theta
score_trump <- wfm2$theta
cor(score_reagan, score_trump)

# estimate wordscores with Reagan 1981 = 1, Clinton 1993 = 0, and Obama 2009 = -1
ws_pres <- textmodel_wordscores(presDfm, c(1, NA, NA, 0, NA, NA, NA, -1, NA, NA))
summary(ws_pres)
# scale all the texts (including the reference ones)
pr_all_pres <- predict(ws_pres, se.fit = TRUE)
pr_all_pres

# estimate wordscores with Trump 2017 = 1, Clinton 1993 = 0, and Obama 2009 = -1
ws_pres2 <- textmodel_wordscores(presDfm, c(NA, NA, NA, 0, NA, NA, NA, -1, NA, 1))
summary(ws_pres2)
# scale all the texts (including the reference ones)
pr_all_pres2 <- predict(ws_pres2, se.fit = TRUE)
pr_all_pres2

summary(corpus_subset(data_corpus_inaugural, Year > 1980))
doclab <- docnames(corpus_subset(data_corpus_inaugural, Year > 1980))
doclab

# Comparing wordscores with Reagan or Trump as the reference text set to 1
reagan_ws <- textplot_scale1d(pr_all_pres, margin = "documents", doclabels = doclab)
trump_ws <- textplot_scale1d(pr_all_pres2, margin = "documents", doclabels = doclab)
plot_grid(reagan, trump, reagan_ws, trump_ws,
          labels = c('Reagan_WF', 'Trump_WF', "Reagan_WS=1", "Trump_WS=1"))

# check for the correlation
score_ws_reagan <- pr_all_pres$fit
score_ws_trump <- pr_all_pres2$fit
set_scores <- data.frame(score_reagan, score_trump, score_ws_reagan, score_ws_trump)
str(set_scores)
cor(set_scores, use = "complete.obs", method = "kendall")
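# A minimal sketch (not part of the original script) of how to look up the "dir"
# indices used above by document name instead of counting rows in summary();
# the quoted docnames follow the standard naming of data_corpus_inaugural.
which(docnames(presDfm) %in% c("1981-Reagan", "2009-Obama", "2017-Trump")) # 1, 8, 10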
# Compared to wordscores, you see that: a) the wordfish results do not change with the
# document selection in "dir" (beyond the direction of the scale); b) on the contrary,
# the results you get with wordscores are highly sensitive to that choice;
# c) finally, the wordscores results with Reagan = 1 much more closely resemble
# the results you get via Wordfish.
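# A minimal sketch (not part of the original script; wfm_flip is an illustrative name)
# of point a): flipping the two documents passed to "dir" only reverses the direction
# of the latent dimension, leaving the configuration of the estimates unchanged.
wfm_flip <- textmodel_wordfish(presDfm, dir = c(1, 8)) # Reagan 1981 to the left of Obama 2009
cor(wfm$theta, wfm_flip$theta) # expected to be close to -1: same scale, opposite sign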