rm(list = ls(all = TRUE))

getwd()
# set here the folder where you have saved the datasets you are going to use in this class
# setwd("*****")
# in my case
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK")
getwd()

library(readtext)
library(quanteda)

#########################################################################
#########################################################################
# Creating the corpus of the UK electoral programs 1992 and 1997
#########################################################################
#########################################################################

myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK/*.txt",
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Party", "Year"))
str(myText)
testCorpus <- corpus(myText)
summary(testCorpus)

# I want to give the texts in the just-created corpus more informative names.
# There are several possibilities:

# first one
testCorpus <- corpus(myText, docid_field = "doc_id")
summary(testCorpus)
docnames(testCorpus) <- gsub(".txt", "", docnames(testCorpus), fixed = TRUE)
summary(testCorpus)

# second one
testCorpus2 <- corpus(myText)
summary(testCorpus2)
docnames(testCorpus2) <- paste(myText$Party, myText$Year, sep = " ")
summary(testCorpus2)

# Comparing the results with and without stopwords, and with and without
# stemming, is always good practice
myDfm <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
             stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(myDfm, 20)  # 20 top words

#########################################################################
#########################################################################
# Using Wordscores: UK example with economic policy positions
#########################################################################
#########################################################################

# reference texts: the 1992 party manifestos
# reference scores: Lab: 5.35; LibDem: 8.21; Cons: 17.21

# Set the reference scores (the 1997 "virgin" texts, to be scored, get NA)
refscores <- c(17.21, NA, 5.35, NA, 8.21, NA)
refscores
ws <- textmodel_wordscores(myDfm, refscores)
summary(ws)

# alternatively
ws2 <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA))
summary(ws2)

# alternative way to set the reference scores
ws3 <- textmodel_wordscores(myDfm, c(17.21, rep(NA, 1), 5.35, rep(NA, 1), 8.21, rep(NA, 1)))
summary(ws3)

# predict the raw scores for the virgin (1997) texts
pr_raw <- predict(ws, newdata = myDfm[c(2, 4, 6), ])
pr_raw

# rescale using the LBG transformation
pr <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ])
pr

# obtain the corresponding confidence intervals
pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
               interval = "confidence")
pr2

# scale all texts (including the reference ones) and get the corresponding
# confidence intervals. Look at the raw score of LibDem 97 vs. LibDem 92 now,
# and contrast it with the previous situation; do the same with Cons 92 vs. Cons 97
pr_all <- predict(ws, interval = "confidence")
pr_all

# Plot the estimated word positions in the reference texts (frequency vs. word
# score); highlight selected words and print them in red
textplot_scale1d(ws)
textplot_scale1d(ws, highlighted = c("budget", "green", "millennium"),
                 highlighted_color = "red")
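# A quick way to inspect the estimated word scores directly, rather than
# through the plot. This is a minimal sketch; it assumes a coef() method is
# available for the fitted wordscores object (otherwise use ws$wordscores,
# as we do further below):
word_scores <- sort(coef(ws))
head(word_scores, 10)  # words pulling towards the low end of the scale
tail(word_scores, 10)  # words pulling towards the high end of the scale
word_scores[c("budget", "green", "millennium")]  # the words highlighted above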
# Plot the estimated document positions, grouped by the "Party" or "Year"
# variable [raw scores: this is easy!]
summary(testCorpus)
doclab <- docnames(testCorpus)
doclab

# Get the predictions
pred <- predict(ws, se.fit = TRUE)
pred
textplot_scale1d(pred, margin = "documents", doclabels = doclab)
textplot_scale1d(pred, margin = "documents", doclabels = doclab,
                 groups = docvars(testCorpus, "Party"))
textplot_scale1d(pred, margin = "documents", doclabels = doclab,
                 groups = docvars(testCorpus, "Year"))

# save the estimates - feature (word) scores
ws <- textmodel_wordscores(myDfm, refscores)
str(ws)
words <- ws$wordscores
str(words)
write.csv(words, "result_wordscores_words.csv")

# save the estimates - document scores
str(pr_all)
pr_all
pr_all$fit
ch <- as.data.frame(pr_all$fit)
ch$Party <- rownames(ch)
str(ch)
ch$se <- pr_all$se.fit
str(ch)
write.csv(ch, "parties.csv")

# Alternative way of plotting the estimated document positions [raw scores]
# order the parties in ascending order (for descending order: step <- order(-ch$fit))
step <- order(ch$fit)
ch <- ch[step, ]
str(head(ch))

# Plot the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(ch$fit, labels = ch$Party, col = "blue",
         xlim = c(floor(min(ch$lwr)), ceiling(max(ch$upr))), main = title)
for (i in 1:nrow(ch)) {
  lines(x = c(ch$lwr[i], ch$upr[i]), y = c(i, i))
}

# Plot the estimated document positions [transformed scores - LBG]
pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(2, 4, 6), ],
               interval = "confidence")
pr2
str(pr2)
pr2$fit
rescaled <- as.data.frame(pr2$fit)
rescaled$Party <- rownames(rescaled)
str(rescaled)

# order the parties in ascending order
step <- order(rescaled$fit)
rescaled <- rescaled[step, ]
str(head(rescaled))

# Plot the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(rescaled$fit, labels = rescaled$Party, col = "blue",
         xlim = c(floor(min(rescaled$lwr)), ceiling(max(rescaled$upr))), main = title)
for (i in 1:nrow(rescaled)) {
  lines(x = c(rescaled$lwr[i], rescaled$upr[i]), y = c(i, i))
}

# if you also want to add the reference scores to the previous graph:
ws <- textmodel_wordscores(myDfm, refscores)
str(ws)
fit <- ws$y
Party <- ws$x@Dimnames$docs
original_scores <- data.frame(Party, fit)
original_scores$Party <- as.character(original_scores$Party)
str(original_scores)
original_scores <- na.omit(original_scores)
str(original_scores)
str(rescaled)
original_scores$lwr <- 0
original_scores$upr <- 0
str(original_scores)
mydata <- rbind(rescaled, original_scores)
str(mydata)

# order the parties in ascending order (for descending order: step <- order(-mydata$fit))
step <- order(mydata$fit)
mydata <- mydata[step, ]
str(head(mydata))

# Plot the graph
title <- "Parties positions with 95% Confidence Intervals"
dotchart(mydata$fit, labels = mydata$Party, col = "blue",
         xlim = c(floor(min(mydata$lwr)), ceiling(max(mydata$upr))), main = title)
for (i in 1:nrow(mydata)) {
  lines(x = c(mydata$lwr[i], mydata$upr[i]), y = c(i, i))
}

#########################################################################
#########################################################################
# Using Wordscores: UK example with social policy positions
#########################################################################
#########################################################################

# reference texts: the 1992 party manifestos
# reference scores: Lab: 6.87; LibDem: 6.53; Cons: 15.34

# Run the analysis (with stemming!), focusing on the raw scores
# (a minimal sketch follows)
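# A minimal sketch of this exercise. Hedged: it assumes the same corpus, the
# same stemmed dfm (myDfm), and the same document ordering as in the economic
# policy example above; only the reference scores change:
refscores_social <- c(15.34, NA, 6.87, NA, 6.53, NA)
ws_social <- textmodel_wordscores(myDfm, refscores_social)
summary(ws_social)
# raw scores (with standard errors) for the virgin (1997) manifestos
pr_social <- predict(ws_social, newdata = myDfm[c(2, 4, 6), ], se.fit = TRUE)
pr_social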
#########################################################################
#########################################################################
# Using Wordscores: US presidential inaugural speeches after 1980
#########################################################################
#########################################################################

# apply Wordscores, estimating both the raw scores and the LBG-rescaled ones
# reference scores: Reagan 1981: +1; Obama 2009: -1; Clinton 1993: 0
# (a sketch of this exercise appears at the end of the script)

#########################################################################
#########################################################################
# Using Wordscores: UK example with economic policy positions but without stemming
#########################################################################
#########################################################################

# replicate the previous analysis to estimate the raw scores with all parties,
# and compare the results with those you get with stemming
# Note: this is just suggested homework! (see the final sketch below)
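# A minimal sketch for the US inaugural-speech exercise above. Hedged: it
# assumes the data_corpus_inaugural corpus shipped with quanteda, with
# documents named in the "1981-Reagan" style, and reuses the same dfm
# options as in the UK example:
inaugCorpus <- corpus_subset(data_corpus_inaugural, Year > 1980)
inaugDfm <- dfm(inaugCorpus, remove = stopwords("english"), tolower = TRUE,
                stem = TRUE, remove_punct = TRUE, remove_numbers = TRUE)
# reference scores: +1 (Reagan 1981), -1 (Obama 2009), 0 (Clinton 1993), NA elsewhere
inaugScores <- rep(NA, ndoc(inaugDfm))
inaugScores[docnames(inaugDfm) == "1981-Reagan"] <- 1
inaugScores[docnames(inaugDfm) == "2009-Obama"] <- -1
inaugScores[docnames(inaugDfm) == "1993-Clinton"] <- 0
ws_inaug <- textmodel_wordscores(inaugDfm, inaugScores)
predict(ws_inaug)                     # raw scores
predict(ws_inaug, rescaling = "lbg")  # LBG-rescaled scores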
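# And a minimal sketch for the no-stemming homework. Hedged: it is identical
# to the economic-policy analysis above except that stem = FALSE, and it
# reuses testCorpus, refscores, and the fitted model ws from that analysis:
myDfm_nostem <- dfm(testCorpus, remove = stopwords("english"), tolower = TRUE,
                    stem = FALSE, remove_punct = TRUE, remove_numbers = TRUE)
ws_nostem <- textmodel_wordscores(myDfm_nostem, refscores)
# compare the raw scores for all parties with those from the stemmed dfm
predict(ws_nostem, se.fit = TRUE)
predict(ws, se.fit = TRUE)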