rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)
library(quanteda.textmodels)
library(quanteda.textplots)
library(quanteda.textstats)
library(ggplot2)
library(cowplot)   # needed later for plot_grid()

#########################################################################
#########################################################################
# Creating and Working with a Corpus
# Let's create a corpus based on the UK parties' manifestos from the 1992 and 1997 general elections
#########################################################################
#########################################################################

myText <- readtext("C:/Users/luigi/Dropbox/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year"))
str(myText)

# Text pre-processing: let's do some cleaning;
# for example, replace the apostrophe (quanteda struggles with it...)
myText$text <- gsub("'", " ", myText$text)

testCorpus <- corpus(myText)
summary(testCorpus)

# Rename the documents by dropping the ".txt" extension
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus))
summary(testCorpus)

# Alternatively, I could create a new variable
myText$code <- paste(myText$Party, as.character(myText$Year), sep = " ")
# and then pass it as the document identifier
testCorpus2 <- corpus(myText, docid_field = "code")
summary(testCorpus2)

# Comparing the results with and w/o stopwords, and with and w/o stemming, is always good practice.
tok2 <- tokens(testCorpus, remove_punct = TRUE, remove_numbers = TRUE, remove_symbols = TRUE,
               split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
myDfm <- dfm(tok2)

topfeatures(myDfm, 20)   # 20 top words

# Let's keep only those features with at least 2 characters (to remove, for example, the "s")
myDfm <- dfm_select(myDfm, min_nchar = 2)
topfeatures(myDfm, 20)   # 20 top words

#########################################################################
#########################################################################
# Check similarity across documents
#########################################################################
#########################################################################

summary(testCorpus)

# compute some document similarities
Simil <- textstat_simil(myDfm, method = "cosine")
Simil

#########################################################################
#########################################################################
# Using Wordscores: UK example with economic policy positions
#########################################################################
#########################################################################

# Reference texts: the 1992 party manifestos
# Reference text scores: Lab: 5.35; LibDem: 8.21; Cons: 17.21
# (the lower (higher) the score, the more (less) pro-state intervention in the economy)
# Reference scores derived from the Laver-Garry expert survey

# FIRST step:
# Set the reference scores
refscores <- c(17.21, NA, 5.35, NA, 8.21, NA)
refscores

# SECOND step:
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)
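# Optional sanity check (a minimal sketch): textmodel_wordscores() matches the
# reference scores to documents by position, so refscores must follow the
# document order of myDfm (assumed here to be alphabetical by party, with the
# 1992 manifesto before the 1997 one for each party).
# Printing them side by side makes any misalignment easy to spot.
data.frame(document = docnames(myDfm), reference_score = refscores)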
# Plot the estimated word positions in the reference texts (highlight some words and print them in red);
# the plot shows word frequency vs. word score
textplot_scale1d(ws, margin = "features",
                 highlighted = c("budget", "green", "millennium"),
                 highlighted_color = "red")

# Doing the FIRST and SECOND steps in one single step
ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws2)

# alternative way to set the reference scores
ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
summary(ws3)

# THIRD step: predict the raw Wordscores for all the texts (reference and virgin ones)
pr_raw <- predict(ws, se.fit = TRUE, newdata = myDfm)
# Why that warning? Because the dfm contains more features than the reference texts,
# so some features included in the dfm cannot get a wordscore from the reference texts!
nfeat(myDfm)              # 6559 features in the dfm of our corpus
length(ws$wordscores)     # 5061 features scored in the reference texts
nfeat(myDfm) - length(ws$wordscores)

# let's see the results
pr_raw
textplot_scale1d(pr_raw)

# alternative way (with c.i. rather than with s.e.)
pr_all2 <- predict(ws, interval = "confidence", newdata = myDfm)
pr_all2
textplot_scale1d(pr_all2)

# Plot the estimated document positions, grouped by the "Party" or the "Year" variable
summary(testCorpus)
textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Party"))
textplot_scale1d(pr_all2, margin = "documents", groups = docvars(testCorpus, "Year"))

# Predict only the virgin texts using the LBG rescaling option
summary(ws)
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ])
pr_lbg

# obtain the corresponding confidence intervals
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
                  interval = "confidence")
pr_lbg

#########################################################################
#########################################################################
# Using Wordscores: UK example with decentralization positions
#########################################################################
#########################################################################

# Reference texts: the 1992 party manifestos
# Reference text scores: Lab: 10.21; LibDem: 5.26; Cons: 15.61
# (the higher (lower) the score, the less (more) pro-decentralization)

# Run the analysis focusing on the raw scores

# FIRST step:
# Set the reference scores
refscores <- c(15.61, NA, 10.21, NA, 5.26, NA)
refscores

# SECOND step:
# Assign the reference scores to your dfm
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# THIRD step: predict the raw Wordscores for all the texts (reference and virgin ones)
pr_all2 <- predict(ws, interval = "confidence")
pr_all2
textplot_scale1d(pr_all2)
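# Side illustration (a sketch, not part of the main analysis): besides "lbg",
# predict.textmodel_wordscores() also offers the Martin-Vanberg ("mv") rescaling,
# applied here to the decentralization model just fitted. With three reference
# texts it anchors the rescaling on the minimum and maximum reference scores
# only, and it may warn about that.
pr_mv <- predict(ws, rescaling = "mv", newdata = myDfm[c(2, 4, 6), ],
                 interval = "confidence")
pr_mv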
# let's compare the results we got for the economic vs. the decentralization policy dimension

# Wordscores for the economic dimension (re-estimated)
ws <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
pr_eco <- predict(ws, interval = "confidence")

eco <- textplot_scale1d(pr_eco)
dec <- textplot_scale1d(pr_all2)
plot_grid(eco, dec, labels = c('Economic', 'Decentralization'))

str(ws)
str(pr_all2)
str(pr_eco)

# check the correlation between the two sets of document scores
party <- ws$x@Dimnames$docs
score_dec <- pr_all2$fit
score_eco <- pr_eco$fit
scores_texts <- data.frame(party, score_dec, score_eco)
str(scores_texts)

colnames(scores_texts)[2] <- "scoreDEC"
colnames(scores_texts)[5] <- "scoreECO"
str(scores_texts)

cor(scores_texts$scoreDEC, scores_texts$scoreECO)

# Plotting the 2-D policy space
ggplot(scores_texts, aes(x = scoreECO, y = scoreDEC)) +
  geom_point() +
  geom_text(label = scores_texts$party, vjust = -1) +
  ylab(label = "Decentralization policy") +
  xlab("Economic policy") +
  theme_light()
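# Optional follow-up (a small sketch): with only six documents the correlation
# above rests on very few points. cor.test() from base R reports the same Pearson
# correlation together with a confidence interval and a p-value, which helps keep
# the estimate in perspective.
cor.test(scores_texts$scoreDEC, scores_texts$scoreECO)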