rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/luigi/Dropbox/TOPIC MODEL/") library(readtext) library(quanteda) library(wordshoal) library(ggplot2) library(plotrix) ########################### ########################### # WORDSHOAL ########################### ########################### # Irish Dail speeches from 2007-2011 head(docvars(data_corpus_irish30)) tok2 <- tokens(data_corpus_irish30, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) tok2 <- tokens_wordstem (tok2) iedfm<- dfm(tok2 ) topfeatures(iedfm, 20) # 20 top words # 72 MPs Members... length(unique(iedfm@docvars$memberID)) length(unique(iedfm@docvars$member.name)) # ...that speak over 10 debates at least once length(unique(iedfm@docvars$debateID)) # overall the 72 Members speak 127 times - some MP speaks in more than 1 speech length(iedfm@docvars$debateID) # take a look at the matrix x <- iedfm@docvars fix(x) # let's run wordshoal by identifying the polarity as in a usual wordfish (here Costello to the left of Ahern) # "groups" refers to the debates; "authors" refers to the single MP unique(iedfm@docvars$member.name) wordshoalfit <- textmodel_wordshoal(iedfm, dir = c(7,1), groups = docvars(data_corpus_irish30, "debateID"), authors = docvars(data_corpus_irish30, "member.name")) summary(wordshoalfit) # the overall theta for each MPs along the single overall latent dimension # Let's see the columns we get out of the textmodel_wordshoal command str(wordshoalfit) # theta refers to document positions [in this case Members position across debates]; # beta refers to debate marginal effects [the higher the absolute value of beta, the more a debate is associate with # the overall scale] you can use these values (and its positive/negative sign) # to understand which debates are most related to the latent dimension extracted # Let's see and example: debateID <- unique(wordshoalfit$groups) beta <- wordshoalfit$beta matrix <- as.data.frame(cbind(debateID , beta)) # as you can see, debate 1 and 6 are negatively correlated with the thetas, while debate 7 and 10 are positively correlated matrix [order(matrix $beta),] # alpha refers to group/debate fixed effects; # psi refers to author group-level positions [position of legislator i in debate j] # note that "psi*beta" = speech position of legislator i on debate j (calibrated to the general scale) # let's plot the positions of Authors along the single latent dimension (w/o and with the c.i.) scores <- wordshoalfit$theta se <- wordshoalfit$se.theta ci <- se*1.96 mp <- as.character(unique(wordshoalfit$authors)) ch <-as.data.frame(cbind(mp, scores, ci)) str(ch) ch$ scores<- as.numeric(paste(ch$ scores)) ch$ci <- as.numeric(paste(ch$ci)) str(ch) ggplot(ch, aes(x=reorder (mp, scores), y=scores, group=1)) + geom_point(aes()) + coord_flip() + xlab("Position") + ylab("MP") ggplot(ch, aes(x=reorder (mp, scores), y=scores, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=scores-ci, ymax=scores+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("MP") # let's extract the thetas author_positions <- summary(wordshoalfit)$estimated.author.positions author_positions$row_names <- rownames(author_positions) str(author_positions) # let's merge with the metadata present in the original corpus head(docvars(data_corpus_irish30)) fitdf <- merge(author_positions, docvars(data_corpus_irish30), by.x = "row_names", by.y = "member.name") str(fitdf) # let's estimate the average position of each party along theta and let's plot it x <- as.matrix(aggregate(theta ~ party.name, data = fitdf, FUN = function(x) c(mean = mean(x), se = std.error(x)))) x2 <- as.data.frame(x) str(x2) x2$theta.mean<- as.numeric(x2$theta.mean) x2$theta.se <- as.numeric(x2$theta.se) str(x2) x2$ci <- x2$theta.se*1.96 str(x2) ggplot(x2, aes(x=reorder (party.name, theta.mean), y=theta.mean, group=1)) + geom_point(aes()) + coord_flip() + xlab("Position") + ylab("Party") ggplot(x2, aes(x=reorder (party.name, theta.mean), y=theta.mean, group=1)) + geom_point(aes()) + geom_errorbar(width=.1, aes(ymin=theta.mean-ci, ymax=theta.mean+ci), colour="darkred") + coord_flip() + xlab("Position") + ylab("Party") # let's extract also the psi and let's plot it document_id <- as.numeric(as.character(wordshoalfit$groups)) document_id # 127 repeated speeches debateID <- unique(document_id ) # 10 debates debateID document_beta <- as.data.frame(cbind(wordshoalfit$beta,debateID)) str(document_beta) names(document_beta)[1] <- "beta" str(document_beta) fitdf2 <- merge(fitdf, document_beta, by = "debateID") str(fitdf2) # note that "psi*beta" = speech position of legislator i on debate j (calibrated to the general scale) fitdf2$psi <- wordshoalfit$psi str(fitdf2) fitdf2$mp_debate_pos <- fitdf2$psi*fitdf2$beta # rescaling values to compare with wordfish str(fitdf2) fitdf2$debateIDstr <- as.character(fitdf2$debateID) str(fitdf2) fitdf2$MPdebate <- paste(fitdf2$row_names, fitdf2$debateIDstr, sep="") str(fitdf2) ggplot(fitdf2, aes(x=reorder (MPdebate , mp_debate_pos), y=mp_debate_pos, group=1)) + geom_point(aes()) + coord_flip() + xlab("Position") + ylab("MPdebate") # too messy! so let's focus only on "Mr. Fergus O'Dowd" newdata <- fitdf2[ which(fitdf2$row_names=="Mr. Fergus O'Dowd"), ] str(newdata) # we can see the change across debates of the position of "Mr. Fergus O'Dowd" ggplot(newdata, aes(x=reorder (MPdebate , mp_debate_pos), y=mp_debate_pos, group=1)) + geom_point(aes()) + coord_flip() + xlab("Position") + ylab("MPdebate")