rm(list=ls(all=TRUE)) getwd() # set here the folder where you have saved the datasets you are going to use in this class # setwd("*****") # in my case setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK") getwd() library(readtext) library(quanteda) library(ggplot2) library(cowplot) ######################################################################### ######################################################################### # Creating the Corpus of the UK electoral programs 1992, 1997 ######################################################################### ######################################################################### myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt", docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year")) str(myText) testCorpus <- corpus(myText) summary(testCorpus) # I want to name the texts in my just created corpus in a more proper way! # several possibilities: # first one testCorpus <- corpus(myText, docid_field = "doc_id") summary(testCorpus) docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus )) summary(testCorpus) # second one testCorpus2 <- corpus(myText) summary(testCorpus2) docnames(testCorpus2) <- paste(myText$Party, myText$Year, sep = " ") summary(testCorpus2) # As discussed, always a good idea to delete stopwords when using Wordscores! myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE, remove_punct = TRUE, remove_numbers=TRUE) ######################################################################### ######################################################################### # Checking for length of texts, drop unique words and check correlation across documents ######################################################################### ######################################################################### summary(testCorpus) # keep only words occurring >1 times myDfm <- dfm_trim(myDfm, min_termfreq = 1) # compute some document similarities [ Simil <- textstat_simil(myDfm , margin = "documents", method = "correlation") Simil ######################################################################### ######################################################################### # Using wordscores: UK example with economic policy positions ######################################################################### ######################################################################### # reference texts: 1992 parties manifestos # reference texts scores: 1992 parties manifestos. Lab: 5.35; LibDem: 8.21; Cons: 17.21 # reference scores derived from an expert survey # FIRST step: # Set reference scores refscores <- c(17.21, NA, 5.35, NA, 8.21, NA) refscores # SECOND step: # Assign the reference scores to your dfm ws <- textmodel_wordscores(myDfm, refscores) summary(ws) # Plot estimated word positions in the reference texts (highlight words and print them in red) # it shows the frequency vs. the word-score textplot_scale1d(ws) textplot_scale1d(ws, highlighted = c( "budget", "green", "millennium"), highlighted_color = "red") # Doing FIRST and SECOND step in one single step ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA)) summary(ws2) # alternative way to set reference scores ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1))) summary(ws3) # THIRD step: we predict the raw Wordscores for all the texts (reference and virgin ones) pr_raw <- predict(ws, se.fit = TRUE, newdata = myDfm) pr_raw textplot_scale1d(pr_raw) # alternative way (with c.i. rather than with s.e.) pr_all2 <- predict(ws, interval = "confidence") pr_all2 textplot_scale1d(pr_all2) # Plot estimated document positions and group by "party" or "year" variable summary(testCorpus) textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Party")) textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Year")) # we want to predict only the virgin texts using the rescaling LGB option summary(ws) pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ]) pr_lbg # obtaining the corresponding confidence interval pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence") pr_lbg # save the estimates - features scores words <- ws$wordscores str(words) write.csv(words, "result_wordscores_words.csv") # save the estimates - document scores str(pr_all2) pr_all2 pr_all2$fit ch <- as.data.frame(pr_all2$fit) ch$Party <- rownames(ch) str(ch) write.csv(ch, "parties.csv") #### ALTERNATIVE ways to plot confidence intervals with ggplot2 [raw scores] str(ch) ch$ci <- ch$fit-ch$lwr str(ch) ggplot(ch, aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") #### ALTERNATIVE ways to plot confidence intervals with ggplot2 [LBG transformed scores] # Plot estimated document positions [transformed scores - LBG] pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence") pr2 str(pr2) pr2$fit rescaled <- as.data.frame(pr2$fit) rescaled$Party <- rownames(rescaled) str(rescaled ) str(rescaled ) rescaled $ci <- rescaled $fit-rescaled $lwr str(rescaled ) ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") # if you want to add to the previous graph also the reference scores: str(ws) fit <- ws$y Party <- ws$x@Dimnames$docs original_scores <-data.frame(Party , fit) original_scores$Party <- as.character(original_scores$Party ) str(original_scores) original_scores <- na.omit(original_scores) original_scores$lwr <- 0 original_scores$upr <- 0 str(original_scores) original_scores$ci <- original_scores$upr-original_scores$lwr str(original_scores) str(rescaled) rescaled <- rbind(rescaled, original_scores) str(rescaled ) ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") ######################################################################### ######################################################################### # Using wordscores: UK example with social policy positions ######################################################################### ######################################################################### # reference texts: 1992 parties manifestos # reference texts scores: 1992 parties manifestos. Lab: 6.87; LibDem: 6.53; Cons: 15.34 # Run the analysis by focusing on the raw scores ######################################################################### ######################################################################### # Wordscores vs. Wordfish ######################################################################### ######################################################################### ######################################################################### # Let's compare the results we get from Wordfish with the raw score ones # we get from Wordscores using the economic policy position ######################################################################### ######################################################################### # Wordscores ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1))) pr_all <- predict(ws, interval = "confidence") # Wordfish with LAB 92 to the left of CONS 92 wfm <- textmodel_wordfish(myDfm, dir = c(3, 1)) # Comparing wordscores vs wordfish wordscores <- textplot_scale1d(pr_all) wordfish <- textplot_scale1d(wfm, margin = "documents") plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish')) # insights: a) same movement of parties between 1992 and 1997: that's a good thing!; # b) same position of Cons but not of other 2 parties: what are we measuring with wordfish goes beyond "economic policy" issues? # check for the correlation party <- wfm$docs score_wf <-wfm$theta score_ws <- pr_all$fit scores_texts <-data.frame(party, score_wf, score_ws) str(scores_texts) colnames(scores_texts)[3] <- "score_ws" str(scores_texts) cor(scores_texts$score_ws, scores_texts$score_wf) # you can also draw a scatter, with a fit lines and party names plot(scores_texts$score_ws, scores_texts$score_wf, main="Scatterplot", xlab="Wordscores", ylab="Wordfish", pch=19) text(scores_texts$score_ws, scores_texts$score_wf, labels = scores_texts$party, pos = 4, col = "royalblue" , cex = 0.8) abline(lm(scores_texts$score_wf ~scores_texts$score_ws ), col="red") # regression line (y~x) ######################################################################### ######################################################################### # Replicate the comparison between Wordfish and Wordscores estimates using # for Wordscores the social policy position ######################################################################### ######################################################################### abline(lm(scores_texts_soc$score_wf ~scores_texts_soc$score_ws ), col="red") # regression line (y~x)