rm(list=ls(all=TRUE)) setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK") getwd() library(readtext) library(quanteda) library(quanteda.textmodels) library(cowplot) library(PerformanceAnalytics) library(psych) library(quanteda.textplots) library(quanteda.textstats) library(ggplot2) ######################################################################### ######################################################################### # Creating and Working with a Corpus ######################################################################### ######################################################################### myText <- readtext("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt", docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year")) str(myText) # Text pre-processing: let's make some cleaning; # for example the apostrophe (Quanteda struggles with that...) myText$text <- gsub("'"," ",myText$text) testCorpus <- corpus(myText ) summary(testCorpus) # I rename the name of the documents docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus )) summary(testCorpus) # Alternatively I could create a new variable myText$code <- paste(myText$Party, as.character(myText$Year), sep = " ") # and the pass it as the name of the documents to be considered testCorpus2 <- corpus(myText, docid_field = "code") summary(testCorpus2) # Comparing the results with and w/o stopwords, with and w/o stemming is always a good practice. tok2 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) tok2 <- tokens_wordstem (tok2) myDfm <- dfm(tok2 ) topfeatures(myDfm , 20) # 20 top words # let's keep just those features with at least 2 characters (to remove for example the "s") myDfm <- dfm_remove(myDfm, min_nchar=2) topfeatures(myDfm , 20) # 20 top words ######################################################################### ######################################################################### # Drop unique words and check similarity across documents ######################################################################### ######################################################################### summary(testCorpus) # keep only words occurring >1 times myDfm <- dfm_trim(myDfm, min_termfreq = 2) # compute some document similarities Simil <- textstat_simil(myDfm , method = "cosine") Simil ######################################################################### ######################################################################### # Using wordscores: UK example with economic policy positions ######################################################################### ######################################################################### # reference texts: 1992 parties manifestos # reference texts scores: 1992 parties manifestos. Lab: 5.35; LibDem: 8.21; Cons: 17.21 # reference scores derived from an expert survey # FIRST step: # Set reference scores refscores <- c(17.21, NA, 5.35, NA, 8.21, NA) refscores # SECOND step: # Assign the reference scores to your dfm ws <- textmodel_wordscores(myDfm, refscores) summary(ws) # Plot estimated word positions in the reference texts (highlight words and print them in red) # it shows the frequency vs. the word-score textplot_scale1d(ws, highlighted = c( "budget", "green", "millennium"), highlighted_color = "red") # Doing FIRST and SECOND step in one single step ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA)) summary(ws2) # alternative way to set reference scores ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1))) summary(ws3) # THIRD step: we predict the raw Wordscores for all the texts (reference and virgin ones) pr_raw <- predict(ws, se.fit = TRUE, newdata = myDfm) pr_raw textplot_scale1d(pr_raw) # alternative way (with c.i. rather than with s.e.) pr_all2 <- predict(ws, interval = "confidence", newdata = myDfm) pr_all2 textplot_scale1d(pr_all2) # Plot estimated document positions and group by "party" or "year" variable summary(testCorpus) textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Party")) textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Year")) # we want to predict only the virgin texts using the rescaling LGB option summary(ws) pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ]) pr_lbg # obtaining the corresponding confidence interval pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence") pr_lbg ############################################# #### ALTERNATIVE ways to plot confidence intervals with ggplot2 [raw scores] ############################################# str(pr_all2) pr_all2 pr_all2$fit ch <- as.data.frame(pr_all2$fit) ch$Party <- rownames(ch) str(ch) ch$ci <- ch$fit-ch$lwr str(ch) ggplot(ch, aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") #### ALTERNATIVE ways to plot confidence intervals with ggplot2 [LBG transformed scores] # Plot estimated document positions [transformed scores - LBG] pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ], interval = "confidence") pr2 str(pr2) pr2$fit rescaled <- as.data.frame(pr2$fit) rescaled$Party <- rownames(rescaled) str(rescaled ) str(rescaled ) rescaled $ci <- rescaled $fit-rescaled $lwr str(rescaled ) ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") # if you want to add to the previous graph also the reference scores: str(ws) fit <- ws$y Party <- ws$x@Dimnames$docs original_scores <-data.frame(Party , fit) original_scores$Party <- as.character(original_scores$Party ) str(original_scores) original_scores <- na.omit(original_scores) original_scores$lwr <- 0 original_scores$upr <- 0 str(original_scores) original_scores$ci <- original_scores$upr-original_scores$lwr str(original_scores) str(rescaled) rescaled <- rbind(rescaled, original_scores) str(rescaled ) ggplot(rescaled , aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") ######################################################################### ######################################################################### # Using wordscores: UK example with social policy positions ######################################################################### ######################################################################### # reference texts: 1992 parties manifestos # reference texts scores: 1992 parties manifestos. Lab: 6.87; LibDem: 6.53; Cons: 15.34 # Run the analysis by focusing on the raw scores # FIRST step: # Set reference scores refscores <- c(15.34, NA, 6.87, NA, 6.53, NA) refscores # SECOND step: # Assign the reference scores to your dfm ws <- textmodel_wordscores(myDfm, refscores) summary(ws) # THIRD step: we predict the raw Wordscores for all the texts (reference and virgin ones) pr_all2 <- predict(ws, interval = "confidence") pr_all2 textplot_scale1d(pr_all2) # let's compare results we got via economic vs. social policy dimension # Wordscores ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1))) pr_eco <- predict(ws, interval = "confidence") eco <- textplot_scale1d(pr_eco) soc <- textplot_scale1d(pr_all2) plot_grid(eco , soc , labels = c('Economic', 'Social')) str(ws) str(pr_all2) str(pr_eco) # check for the correlation party <- ws$x@Dimnames$docs score_soc <- pr_all2$fit score_eco <- pr_eco$fit scores_texts <-data.frame(party, score_soc, score_eco ) str(scores_texts) colnames(scores_texts)[2] <- "scoreSOC" colnames(scores_texts)[5] <- "scoreECO" str(scores_texts) cor(scores_texts$scoreSOC, scores_texts$scoreECO) # Plotting the 2-D policy space plot(scores_texts$scoreECO, scores_texts$scoreSOC, main = "UK 2-D policy space", xlab = "Economic scale", ylab = "Social Scale", pch = 19, frame = FALSE, xlim=c(9,12), ylim=c(9,11)) text(scores_texts$scoreECO, scores_texts$scoreSOC, labels = scores_texts$party, pos = 3) ######################################################################### ######################################################################### # Wordscores vs. Wordfish ######################################################################### ######################################################################### ######################################################################### # Let's compare the results we get from Wordfish with the raw score ones # we get from Wordscores using the economic policy position ######################################################################### ######################################################################### # Wordscores ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA,1), 5.35, rep(NA,1), 8.21, rep(NA,1))) pr_all <- predict(ws, interval = "confidence") # Wordfish with LAB 92 to the left of CONS 92 wfm <- textmodel_wordfish(myDfm, dir = c(3, 1)) # Comparing wordscores vs wordfish wordscores <- textplot_scale1d(pr_all) wordfish <- textplot_scale1d(wfm, margin = "documents") plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish')) # insights: a) same movement of parties between 1992 and 1997: that's a good thing!; # b) same position of Cons but not of other 2 parties: what are we measuring with wordfish goes beyond "economic policy" issues? # check for the correlation party <- wfm$docs score_wf <-wfm$theta score_ws <- pr_all$fit scores_texts <-data.frame(party, score_wf, score_ws) str(scores_texts) colnames(scores_texts)[3] <- "score_ws" str(scores_texts) cor(scores_texts$score_ws, scores_texts$score_wf) # you can also draw a scatter, with a fit lines and party names plot(scores_texts$score_ws, scores_texts$score_wf, main="Scatterplot", xlab="Wordscores", ylab="Wordfish", pch=19) text(scores_texts$score_ws, scores_texts$score_wf, labels = scores_texts$party, pos = 4, col = "royalblue" , cex = 0.8) abline(lm(scores_texts$score_wf ~scores_texts$score_ws ), col="red") # regression line (y~x) ######################################################################### ######################################################################### # Replicate the comparison between Wordfish and Wordscores estimates using # for Wordscores the social policy position ######################################################################### ######################################################################### # reference texts: 1992 parties manifestos # reference texts scores: 1992 parties manifestos. Lab: 6.87; LibDem: 6.53; Cons: 15.34 # Run the analysis by focusing on the raw scores # Wordscores ws <- textmodel_wordscores(myDfm, c(15.34, rep(NA,1), 6.87, rep(NA,1), 6.53, rep(NA,1))) pr_all <- predict(ws, interval = "confidence") # Wordfish with LAB 92 to the left of CONS 92 wfm <- textmodel_wordfish(myDfm, dir = c(3, 1)) # Comparing wordscores vs wordfish wordscores <- textplot_scale1d(pr_all) wordfish <- textplot_scale1d(wfm, margin = "documents") plot_grid(wordscores , wordfish , labels = c('Wordscores', 'Wordfish')) # check for the correlation party <- wfm$docs score_wf <-wfm$theta score_ws <- pr_all$fit scores_texts_soc <-data.frame(party, score_wf, score_ws) str(scores_texts_soc) colnames(scores_texts_soc)[3] <- "score_ws" str(scores_texts_soc) # higher correlation with social policy rather than economic policy (.88 vs. .81). Wordfish scale more related to this dimension? cor(scores_texts_soc$score_ws, scores_texts_soc$score_wf) # you can also draw a scatter, with a fit lines and party names plot(scores_texts_soc$score_ws, scores_texts_soc$score_wf, main="Scatterplot", xlab="Wordscores", ylab="Wordfish", pch=19) text(scores_texts_soc$score_ws, scores_texts_soc$score_wf, labels = scores_texts_soc$party, pos = 4, col = "royalblue" , cex = 0.8) abline(lm(scores_texts_soc$score_wf ~scores_texts_soc$score_ws ), col="red") # regression line (y~x)