rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/UK") getwd() library(readtext) library(quanteda) ######################################################################### ######################################################################### # The Irish case ######################################################################### ######################################################################### myText <- readtext("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/Wordscores manifestos/IE/*.txt", docvarsfrom = "filenames", dvsep = " ", docvarnames = c("Year", "Party")) str(myText) testCorpus <- corpus(myText) summary(testCorpus) # Comparing the results with and w/o stopwords, with and w/o stemming is always a good practice myDfm <- dfm(testCorpus , remove = stopwords("english"), tolower = TRUE, stem = TRUE, remove_punct = TRUE, remove_numbers=TRUE) topfeatures(myDfm , 20) # 20 top words # set reference scores ws <- textmodel_wordscores(myDfm, c(4.5, 13.13, 15, 6.88, 17.63, rep(NA, 5))) summary(ws) # rescale using the LBG transformation pr <- predict(ws, rescaling = "lbg", newdata = myDfm[c(6:10), ], interval = "confidence") pr # scaling all texts (including the reference ones) pr_all <- predict(ws, newdata = myDfm) pr_all # Plot estimated document positions # Plot estimated document positions and group by "party" or "year" variable [row scores: this is easy!] summary(testCorpus) doclab <- docnames(testCorpus) doclab # Get predictions pred <- predict(ws, se.fit = TRUE) pred textplot_scale1d(pred, margin = "documents", doclabels = doclab) textplot_scale1d(pred, margin = "documents", doclabels = doclab, groups = docvars(testCorpus, "Party")) textplot_scale1d(pred, margin = "documents", doclabels = doclab, groups = docvars(testCorpus, "Year")) # Plot estimated document positions [transformed scores - LBG] pr2 <- predict(ws, rescaling = "lbg", newdata = myDfm[c(6:10), ], interval = "confidence") pr2 str(pr2) pr2$fit # first way of plotting a graph rescaled <- as.data.frame(pr2$fit) rescaled$Party <- rownames(rescaled) str(rescaled ) # ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score) step <-order(rescaled$fit ) rescaled<- rescaled[step, ] str(head(rescaled)) # Plotting the graph title <- "Parties positions with 95% Confidence Intervals" dotchart(rescaled$fit, labels=rescaled$Party , col="blue", xlim=c(floor(min(rescaled$lwr)), ceiling(max(rescaled$upr))), main=title ) for (i in 1:nrow(rescaled)){ lines(x=c(rescaled$lwr[i],rescaled$upr[i]), y=c(i,i)) } # if you want to add to the previous graph also the reference-scores: ws <- textmodel_wordscores(myDfm) str(ws) fit <- ws$y Party <- ws$x@Dimnames$docs original_scores <-data.frame(Party , fit) original_scores$Party <- as.character(original_scores$Party ) str(original_scores) original_scores <- na.omit(original_scores) str(original_scores) str(rescaled) original_scores$lwr <- 0 original_scores$upr <- 0 str(original_scores) mydata <- rbind(rescaled, original_scores) str(mydata) # ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score) step <-order(mydata $fit ) mydata <- mydata [step, ] str(head(mydata )) # Plotting the graph title <- "Parties positions with 95% Confidence Intervals" dotchart(mydata $fit, labels=mydata $Party , col="blue", xlim=c(floor(min(mydata $lwr)), ceiling(max(mydata $upr))), main=title ) for (i in 1:nrow(mydata )){ lines(x=c(mydata $lwr[i],mydata $upr[i]), y=c(i,i)) } # second way of plotting a graph str(mydata) mydata$ci <- mydata$fit-mydata$lwr str(mydata) mydata$ci[mydata$lwr==0] <- 0 str(head(mydata )) library(ggplot2) ggplot(mydata, aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") + theme_classic() # comparing raw with transformed scores library(cowplot) x <- textplot_scale1d(pred, margin = "documents", doclabels = doclab) y <- ggplot(mydata, aes(x=reorder (Party, fit), y=fit, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=fit-ci, ymax=fit+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") + theme_classic() plot_grid(x , y, labels = c('Raw', 'Transformed') ) ######################################################################### ######################################################################### # Using wordscores: US Presidential Inaugural Speech after 1980 ######################################################################### ######################################################################### # apply wordscores by estimating both the row scores as well as the LBG method # reference scores: Reagan 1981: +1; Obama 2009: -1; Clinton 1993: 0 summary(corpus_subset(data_corpus_inaugural, Year > 1980)) # create a dfm from inaugural addresses from Reagan onwards presDfm <- dfm(corpus_subset(data_corpus_inaugural, Year > 1980), remove = stopwords("english"), stem = TRUE, remove_punct = TRUE) presDfm [, 1:5] ws_pres <- textmodel_wordscores(presDfm, c(1, NA, NA, 0, NA, NA, NA, -1, NA, NA)) summary(ws_pres) ################################ # scaling all texts (including the reference ones) ################################ pr_all_pres <- predict(ws_pres, interval = "confidence" ) pr_all_pres # Plot estimated document positions summary(data_corpus_inaugural) doclab <- docnames(corpus_subset(data_corpus_inaugural, Year > 1980)) doclab pred <- predict(ws_pres, se.fit = TRUE) pred textplot_scale1d(pred, margin = "documents", doclabels = doclab) ################################ # Trump 2017: +1; Obama 2009: -1; Clinton 1993: 0 ################################ ws_pres2 <- textmodel_wordscores(presDfm, c(rep(NA, 3), 0, rep(NA, 3), -1, rep(NA, 1), 1)) summary(ws_pres2) # scaling all texts (including the reference ones) pr_all_pres2 <- predict(ws_pres2, interval = "confidence" ) pr_all_pres2 pred2 <- predict(ws_pres2, se.fit = TRUE) pred2 textplot_scale1d(pred2, margin = "documents", doclabels = doclab) ################################ # comparison! ################################ library(cowplot) x <-textplot_scale1d(pred, margin = "documents", doclabels = doclab) y <-textplot_scale1d(pred2, margin = "documents", doclabels = doclab) plot_grid(x , y, labels = c('Reagan1981:+1', 'Trump2017:+1') ) ch <- as.data.frame(pr_all_pres$fit) ch$Party <- rownames(ch) str(ch) ch2 <- as.data.frame(pr_all_pres2$fit) str(ch2) ch$fit2 <- ch2$fit str(ch) cor(ch$fit, ch$fit2) plot(ch$fit, ch$fit2 , main="Scatterplot Example", xlab="Reagan81=1 ", ylab="Trump2017=1", pch=19) text(ch$fit, ch$fit2 , labels = ch$Party, pos = 4, col = "royalblue" , cex = 0.8) abline(lm(ch$fit2 ~ ch$fit ), col="red") # regression line (y~x) ################################ # using the LBG method ################################ pr_all_pres_lbg <- predict(ws_pres, rescaling = "lbg", newdata = presDfm[c(2:3, 5:7, 9:10), ], interval = "confidence") pr_all_pres_lbg rescaled <- as.data.frame(pr_all_pres_lbg$fit) rescaled$Party <- rownames(rescaled) str(rescaled ) # ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score) step <-order(rescaled$fit ) rescaled<- rescaled[step, ] str(head(rescaled)) # Plotting the graph title <- "Presidents positions with 95% Confidence Intervals" dotchart(rescaled$fit, labels=rescaled$Party , col="blue", xlim=c(floor(min(rescaled$lwr)), ceiling(max(rescaled$upr))), main=title ) for (i in 1:nrow(rescaled)){ lines(x=c(rescaled$lwr[i],rescaled$upr[i]), y=c(i,i)) } # if you want to add to the previous graph also the reference-scores: ws_pres <- textmodel_wordscores(presDfm, c(1, NA, NA, 0, NA, NA, NA, -1, NA, NA)) summary(ws_pres) fit <- ws_pres $y Party <- ws_pres $x@Dimnames$docs original_scores <-data.frame(Party , fit) original_scores$Party <- as.character(original_scores$Party ) str(original_scores) original_scores <- na.omit(original_scores) str(original_scores) str(rescaled) original_scores$lwr <- 0 original_scores$upr <- 0 str(original_scores) mydata <- rbind(rescaled, original_scores) str(mydata) # ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score) step <-order(mydata $fit ) mydata <- mydata [step, ] str(head(mydata )) # Plotting the graph title <- "Presidents positions with 95% Confidence Intervals" dotchart(mydata $fit, labels=mydata $Party , col="blue", xlim=c(floor(min(mydata $lwr)), ceiling(max(mydata $upr))), main=title ) for (i in 1:nrow(mydata )){ lines(x=c(mydata $lwr[i],mydata $upr[i]), y=c(i,i)) } ################################ # Trump 2017: +1; Obama 2009: -1; Clinton 1993: 0 ################################ summary(corpus_subset(data_corpus_inaugural, Year > 1980)) pr_all_pres_lbg2 <- predict(ws_pres2, rescaling = "lbg", newdata = presDfm[c(1:3, 5:7, 9), ], interval = "confidence") pr_all_pres_lbg2 rescaled2 <- as.data.frame(pr_all_pres_lbg2$fit) rescaled2$Party <- rownames(rescaled2) str(rescaled2 ) # ordering the parties in a ascending order (otherwise: step <-order(-scores_texts$score) step2 <-order(rescaled2$fit ) rescaled2<- rescaled2[step2, ] str(head(rescaled2)) # Plotting the graph title <- "Presidents positions with 95% Confidence Intervals" dotchart(rescaled2$fit, labels=rescaled2$Party , col="blue", xlim=c(floor(min(rescaled2$lwr)), ceiling(max(rescaled2$upr))), main=title ) for (i in 1:nrow(rescaled2)){ lines(x=c(rescaled2$lwr[i],rescaled2$upr[i]), y=c(i,i)) } ################################ # comparisons! ################################ str(rescaled ) str(rescaled2 ) colnames(rescaled2 )[1] <- "fit2" str(rescaled2 ) ch <- merge(rescaled , rescaled2 , by=c("Party")) edit(ch ) str(ch ) cor(ch$fit, ch$fit2) plot(ch$fit, ch$fit2 , main="Scatterplot Example", xlab="Reagan81=1 ", ylab="Trump2017=1", pch=19) text(ch$fit, ch$fit2 , labels = ch$Party, pos = 4, col = "royalblue" , cex = 0.8) abline(lm(ch$fit2 ~ ch$fit ), col="red") # regression line (y~x)