# ---------------------------------------------------------------------
# Setup: clear the workspace and point to the folder holding the UK
# manifesto .txt files.
# NOTE(review): rm(list = ls()) and setwd() are discouraged in shared
# scripts; kept here to preserve the original interactive workflow.
rm(list = ls(all = TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(Hmisc)
library(cowplot)

#########################################################################
#########################################################################
# Creating and Working with a Corpus
#########################################################################
#########################################################################

# Read every .txt file; party and year are encoded in the file names
# ("Party Year.txt"), so they become document variables.
myText <- readtext(
  "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
  docvarsfrom = "filenames",
  dvsep = " ",
  docvarnames = c("Party", "Year")
)
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)

testCorpus <- corpus(myText, docid_field = "doc_id")
summary(testCorpus)

# Strip the ".txt" suffix from the document names.
# FIX: the original pattern ".txt" treated '.' as a regex wildcard and
# was unanchored (it would also match e.g. "atxt" anywhere in a name);
# escape the dot and anchor at the end of the string.
docnames(testCorpus) <- gsub("\\.txt$", "", docnames(testCorpus))
summary(testCorpus)

# Comparing the results with and w/o stopwords, and with and w/o
# stemming, is always a good practice. For example, given that Wordfish
# estimates word fixed effects, removing stopwords is not needed as in
# other programs...
# ---------------------------------------------------------------------
# Build the document-feature matrix.
# FIX: passing remove/stem/remove_punct/remove_numbers directly to
# dfm() is the deprecated (and in quanteda >= 3, removed) shortcut API.
# The tokens() pipeline below is the supported equivalent: lowercase,
# drop punctuation/numbers, remove English stopwords, then stem.
myDfm <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE) |>
  dfm(tolower = TRUE) |>
  dfm_remove(stopwords("english")) |>
  dfm_wordstem(language = "english")

topfeatures(myDfm, 20)  # 20 top words

#########################################################################
#########################################################################
# Using wordfish
#########################################################################
#########################################################################
# dir indicates which two documents are used for global identification
# purposes (first document to the left of the second one); this usually
# matters more for the interpretation of the results (i.e., for the
# direction of the scores along the latent dimension: which are positive,
# which negative) than for the estimation per se.

# Here: LAB 92 (doc 3) to the left of CONS 92 (doc 1)
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(3, 1))
summary(wfm)
str(wfm)

# Here: CONS 92 to the left of LAB 92 (direction reversed)
wfm2 <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm2)

# Always do diagnostics! A good start is the analysis of the word
# discrimination parameters.
# Weights (beta) with large absolute values mean these words are
# estimated to lie at the extremes of the latent dimension.

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")
textplot_scale1d(wfm, margin = "features",
                 highlighted = c("government", "global", "children",
                                 "bank", "economy", "citizenship",
                                 "productivity", "deficit", "april"),
                 highlighted_color = "red")

# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")
textplot_scale1d(wfm, margin = "documents",
                 groups = docvars(testCorpus, "Party"))
textplot_scale1d(wfm, margin = "documents",
                 groups = docvars(testCorpus, "Year"))

# Extract the word-level estimates of the model and save them
str(wfm)
words2 <- wfm@features
beta2 <- wfm@beta  # word discrimination parameter
psi2 <- wfm@psi    # word fixed effect
scores_words2 <- data.frame(words2, beta2, psi2)
str(scores_words2)
write.csv(scores_words2, "result_wordfish_words.csv")

# Extract the document-level estimates with 95% confidence bounds
party <- wfm@docs
theta <- wfm@theta        # estimated document position
se.theta <- wfm@se.theta  # its standard error
scores_texts <- data.frame(party, theta, se.theta)
str(scores_texts)
# FIX: the original swapped the bounds (lower = theta + 1.96*se,
# upper = theta - 1.96*se); the 95% CI is theta -/+ 1.96*se.
scores_texts$lower <- scores_texts$theta - 1.96 * scores_texts$se.theta
scores_texts$upper <- scores_texts$theta + 1.96 * scores_texts$se.theta
str(scores_texts)
write.csv(scores_texts, "result_wordfish_texts.csv")

#########################################################################
#########################################################################
# Let's compare the results we get from Wordfish with the raw score ones
# we get from Wordscores using the economic policy position
#########################################################################
#########################################################################

# Set reference scores; NA marks the virgin texts to be scored.
# (rep(NA, 1) in the original is just NA — simplified.)
ws <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws)
pr_all <- predict(ws, newdata = myDfm)
summary(pr_all)
summary(wfm)

# Comparing wordscores vs wordfish side by side
library(cowplot)
wordscores <- textplot_scale1d(pr_all, margin = "documents")
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores,
wordfish , labels = c('Wordscores', 'Wordfish')) # insights: a) same movement of parties between 1992 and 1997; b) same position of cons but not of other 2 parties # what are we measuring with wordfish goes beyond "economic policy" issues? # check for the correlation party <- wfm@docs score_wf <-wfm@theta score_ws <- pr_all@textscores$textscore_raw scores_texts <-data.frame(party, score_wf, score_ws) str(scores_texts) rcorr(scores_texts$score_ws, scores_texts$score_wf) # you can also draw a scatter, with a fit lines and party names plot(scores_texts$score_ws, scores_texts$score_wf, main="Scatterplot", xlab="Wordscores", ylab="Wordfish", pch=19) text(scores_texts$score_ws, scores_texts$score_wf,, labels = scores_texts$party, pos = 4, col = "royalblue" , cex = 0.8) abline(lm(scores_texts$score_wf ~scores_texts$score_ws ), col="red") # regression line (y~x) ######################################################################### ######################################################################### # Using wordfish: US Presidential Inaugural Speech after 1980 ######################################################################### ######################################################################### # apply wordfish by first considering Reagan 1981 to the right of Obama 2009; # and then Trump 2017 to the right of Obama 2009: any change? Compare such results to what you got in wordscores Assignment 2