rm(list=ls(all=TRUE)) getwd() # set here the folder where you have saved the datasets you are going to use in this class # setwd("*****") # in my case setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK") getwd() library(readtext) library(quanteda) ######################################################################### ######################################################################### # Using wordscores: UK example with economic policy positions but without stemming ######################################################################### ######################################################################### # replicate the previous analysis to estimate the row scores with all parties, and compare the results # with what you get with stemming myText <- readtext("*.txt", docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Party", "Year")) str(myText) testCorpus <- corpus(myText) summary(testCorpus) # I want to name the texts in my just created corpus in a more proper way! docnames(testCorpus) <- paste(myText$Party, myText$Year, sep = " ") summary(testCorpus) # Comparing the results with and w/o stopwords, with and w/o stemming is always a good practice myDfm2 <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, remove_punct = TRUE, remove_numbers=TRUE) topfeatures(myDfm2 , 20) # 20 top words refscores <- c(17.21, NA, 5.35, NA, 8.21, NA) refscores ws2 <- textmodel_wordscores(myDfm2, refscores) summary(ws2) pr_all2 <- predict(ws2, interval = "confidence") pr_all2 # compare with the results you get with stemming myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE, remove_punct = TRUE, remove_numbers=TRUE) ws <- textmodel_wordscores(myDfm, c(17.21, NA, 5.35, NA, 8.21, NA)) summary(ws) pr_all <- predict(ws, interval = "confidence") pr_all # more comparisons! ch <- as.data.frame(pr_all$fit) ch$Party <- rownames(ch) str(ch) ch2 <- as.data.frame(pr_all2$fit) str(ch2) ch$fit2 <- ch2$fit str(ch) cor(ch$fit, ch$fit2) plot(ch$fit, ch$fit2 , main="Scatterplot Example", xlab="Stem", ylab="No stem", pch=19) text(ch$fit, ch$fit2 , labels = ch$Party, pos = 4, col = "royalblue" , cex = 0.8) abline(lm(ch$fit2 ~ ch$fit ), col="red") # regression line (y~x) ######################################################################### ######################################################################### # Using wordscores: UK example with social policy positions ######################################################################### ######################################################################### # reference texts: 1992 parties manifestos # reference texts scores: 1992 parties manifestos. Lab: 6.87; LibDem: 6.53; Cons: 15.34 # Run the analysis (with stemming!) by focusing on the row scores # Set reference scores refscores <- c(15.34, NA, 6.87, NA, 6.53, NA) refscores ws <- textmodel_wordscores(myDfm, refscores) summary(ws) pr_all <- predict(ws, interval = "confidence") pr_all # Plot estimated document positions and group by "party" or "year" variable [row scores] summary(testCorpus) doclab <- apply(docvars(testCorpus, c("Party", "Year")), 1, paste, collapse = " ") doclab # Get predictions pred <- predict(ws, se.fit = TRUE) pred textplot_scale1d(pred, margin = "documents", doclabels = doclab) textplot_scale1d(pred, margin = "documents", doclabels = doclab, groups = docvars(testCorpus, "Party")) textplot_scale1d(pred, margin = "documents", doclabels = doclab, groups = docvars(testCorpus, "Year")) ######################################################################### ######################################################################### # Using wordscores: US Presidential Inaugural Speech after 1980 ######################################################################### ######################################################################### # apply wordscores by estimating both the row scores as well as the LBG method # reference scores: Reagan 1981: +1; Obama 2009: -1; Clinton 1993: 0 # create a dfm from inaugural addresses from Reagan onwards presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980), remove = stopwords("english"), stem = TRUE, remove_punct = TRUE) presDfm [, 1:5] ws_pres <- textmodel_wordscores(presDfm, c(1, NA, NA, 0, NA, NA, NA, -1, NA, NA)) summary(ws_pres) # alternatively ws_pres2 <- textmodel_wordscores(presDfm, c(1, rep(NA, 2), 0, rep(NA, 3), -1, rep(NA, 2))) summary(ws_pres2) # scaling all texts (including the reference ones) pr_all_pres <- predict(ws_pres, interval = "confidence" ) pr_all_pres # Plot estimated document positions summary(data_corpus_inaugural) doclab <- docnames(corpus_subset(data_corpus_inaugural, Year > 1980)) doclab pred <- predict(ws_pres, se.fit = TRUE) pred textplot_scale1d(pred, margin = "documents", doclabels = doclab) # using the lbg method pr_all_pres_lbg <- predict(ws_pres, rescaling = "lbg", newdata = presDfm[c(2:3, 5:7, 9:10), ], interval = "confidence") pr_all_pres_lbg rescaled <- as.data.frame(pr_all_pres_lbg$fit) rescaled$Party <- rownames(rescaled) str(rescaled ) # ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score) step <-order(rescaled$fit ) rescaled<- rescaled[step, ] str(head(rescaled)) # Plotting the graph title <- "Presidents positions with 95% Confidence Intervals" dotchart(rescaled$fit, labels=rescaled$Party , col="blue", xlim=c(floor(min(rescaled$lwr)), ceiling(max(rescaled$upr))), main=title ) for (i in 1:nrow(rescaled)){ lines(x=c(rescaled$lwr[i],rescaled$upr[i]), y=c(i,i)) } # if you want to add to the previous graph also the reference-scores: ws_pres <- textmodel_wordscores(presDfm, refscores) str(ws_pres) fit <- ws_pres $y Party <- ws_pres $x@Dimnames$docs original_scores <-data.frame(Party , fit) original_scores$Party <- as.character(original_scores$Party ) str(original_scores) original_scores <- na.omit(original_scores) str(original_scores) str(rescaled) original_scores$lwr <- 0 original_scores$upr <- 0 str(original_scores) mydata <- rbind(rescaled, original_scores) str(mydata) # ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score) step <-order(mydata $fit ) mydata <- mydata [step, ] str(head(mydata )) # Plotting the graph title <- "Presidents positions with 95% Confidence Intervals" dotchart(mydata $fit, labels=mydata $Party , col="blue", xlim=c(floor(min(mydata $lwr)), ceiling(max(mydata $upr))), main=title ) for (i in 1:nrow(mydata )){ lines(x=c(mydata $lwr[i],mydata $upr[i]), y=c(i,i)) }