# Wordscores vs. Wordfish on party manifestos ----
#
# Reads every .txt manifesto in `data_dir` (file names are expected to look
# like "<Year> <Party>.txt"), builds a document-feature matrix, scales the
# texts with Wordscores and Wordfish, and compares the two sets of estimates.
#
# Run this script in a fresh R session: it neither clears the workspace nor
# changes the working directory.

library(readtext)
library(quanteda)
library(Hmisc)
library(cowplot)

# Folder holding the manifesto .txt files; adjust to your machine.
data_dir <- paste0(
  "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/Lecture 2/",
  "Wordscores manifestos/IE"
)
stopifnot(dir.exists(data_dir))

#' Scatter plot of Wordscores vs. Wordfish estimates
#'
#' Draws the points, labels each one, and adds the least-squares line of the
#' Wordfish scores regressed on the Wordscores scores.
#'
#' @param ws_scores Numeric vector of Wordscores estimates (x axis).
#' @param wf_scores Numeric vector of Wordfish estimates (y axis).
#' @param labels Labels for the points, one per document.
#' @return `NULL`, invisibly; called for its plotting side effect.
plot_score_comparison <- function(ws_scores, wf_scores, labels) {
  plot(ws_scores, wf_scores,
       main = "Scatterplot", xlab = "Wordscores", ylab = "Wordfish", pch = 19)
  text(ws_scores, wf_scores,
       labels = labels, pos = 4, col = "royalblue", cex = 0.8)
  abline(lm(wf_scores ~ ws_scores), col = "red") # regression line (y ~ x)
  invisible(NULL)
}

# Creating and working with a corpus ----

# Document variables are taken from the file names, split on a space.
myText <- readtext(file.path(data_dir, "*.txt"),
                   docvarsfrom = "filenames", dvsep = " ",
                   docvarnames = c("Year", "Party"))
str(myText)

testCorpus <- corpus(myText)
summary(testCorpus)

# Use "<Party> <Year>" as the document names shown in tables and plots.
docnames(testCorpus) <- paste(myText$Party, myText$Year)
summary(testCorpus)

# Comparing the results with and w/o stopwords, with and w/o stemming is
# always a good practice.
# NOTE(review): the preprocessing arguments of dfm() and the textmodel_* /
# textplot_* functions used below are taken from `quanteda` as loaded above;
# confirm that the installed quanteda version still provides them.
myDfm <- dfm(testCorpus,
             remove = stopwords("english"),
             tolower = TRUE,
             stem = TRUE,
             remove_punct = TRUE,
             remove_numbers = TRUE)
topfeatures(myDfm, 20) # 20 top words

# Wordscores ----

# Reference scores for the reference texts. They are matched to documents by
# position, so this assumes the reference texts are the first documents of
# the dfm -- TODO confirm against summary(testCorpus).
ref_scores <- c(4.5, 13.13, 15, 6.88, 17.63)
stopifnot(ndoc(myDfm) > length(ref_scores))

# Every remaining document is a "virgin" text and gets an NA score.
doc_scores <- c(ref_scores, rep(NA, ndoc(myDfm) - length(ref_scores)))
virgin_docs <- which(is.na(doc_scores))

ws <- textmodel_wordscores(myDfm, doc_scores)
summary(ws)

# Scaling all texts (including the reference ones).
pr_all <- predict(ws, se.fit = TRUE)
pr_all

doclab <- docnames(testCorpus)
doclab

# Wordfish ----

# `dir` fixes the direction of the scale: document 1 is placed to the left
# of document 3 (here: DL 92 to the left of FG 92).
summary(testCorpus)
wfm <- textmodel_wordfish(myDfm, dir = c(1, 3))
summary(wfm)

# Plot estimated word and document positions.
textplot_scale1d(wfm, margin = "features")
textplot_scale1d(wfm, margin = "documents")

# Comparing Wordscores vs. Wordfish ----

wordscores <- textplot_scale1d(pr_all, margin = "documents",
                               doclabels = doclab)
wordfish <- textplot_scale1d(wfm, margin = "documents")
plot_grid(wordscores, wordfish, labels = c("Wordscores", "Wordfish"))

# Check for the correlation between the two sets of document scores.
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit

scores_texts <- data.frame(party, score_wf, score_ws)
str(scores_texts)
cor(scores_texts$score_ws, scores_texts$score_wf)

# Scatter plot with a fitted line and party names.
plot_score_comparison(scores_texts$score_ws, scores_texts$score_wf,
                      scores_texts$party)

# https://en.wikipedia.org/wiki/24th_Government_of_Ireland
# hint: government situation before 1992 elections (FF); before 1997
# elections (rainbow coalition between FG, Labour and DL, before that,
# FF cabinet)

# Rescale the virgin texts using the LBG transformation.
pr_lbg <- predict(ws, rescaling = "lbg", newdata = myDfm[virgin_docs, ],
                  interval = "confidence")
pr_lbg

# pr_lbg$fit is converted to a data frame; the merge below uses its `fit`
# column and the document names stored as `party`.
rescaled <- as.data.frame(pr_lbg$fit)
rescaled$party <- rownames(rescaled)
str(rescaled)

# Wordfish and Wordscores comparison, 1997 only ----

# Keep only the documents present in both tables (the rescaled virgin texts).
ch <- merge(rescaled, scores_texts, by = "party")
str(ch)
cor(ch$score_wf, ch$fit)

# Scatter plot with a fitted line and party names.
plot_score_comparison(ch$fit, ch$score_wf, ch$party)