# Text-scaling exercise on the South Africa 2014 (date == 201405) manifestos:
#   1. most frequent words per manifesto,
#   2. Wordfish document/word positions,
#   3. Wordscores with two reference texts,
#   4. comparison of both scalings with the RILE index of the main dataset.
#
# NOTE(review): the quanteda calls below (`dfm(..., stem = )`,
# `textstat_frequency()`, `textmodel_*()`, `textplot_scale1d()` taken from
# quanteda itself, `dfm_group()` with a docvar name) look like the quanteda 1.x
# API -- confirm the installed version before running.
#
# The workspace wipe (`rm(list = ls())`) that used to open this script was
# dropped: it destroys the caller's objects; start a fresh R session instead.

library(manifestoR)
library(quanteda)
library(PerformanceAnalytics)
library(ggplot2)

# Only switch to the author's project folder when it exists, so the script
# does not abort on other machines.
project_dir <- "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
getwd()

# Prefer a key supplied through the environment; fall back to the placeholder
# that was hard-coded here.
api_key <- Sys.getenv("MANIFESTO_API_KEY", unset = "XXXXXXXXXX")
mp_setapikey(key.file = NULL, key = api_key)

#########################################################################
# Helpers
#########################################################################

# Build a lower-cased DFM without punctuation, numbers and English stopwords;
# `stem` switches stemming on or off.
build_dfm <- function(corp, stem) {
  dfm(
    corp,
    stem = stem,
    tolower = TRUE,
    remove_punct = TRUE,
    remove = stopwords("english"),
    remove_numbers = TRUE
  )
}

# Scatter plot of `y` against `x` with a text label next to every point and a
# red least-squares line of y on x.
plot_labelled_scatter <- function(x, y, labels, main, xlab, ylab) {
  plot(x, y, main = main, xlab = xlab, ylab = ylab, pch = 19)
  text(x, y, labels = labels, pos = 4, col = "royalblue", cex = 0.8)
  abline(lm(y ~ x), col = "red") # regression line (y ~ x)
  invisible(NULL)
}

#########################################################################
# Assignment with South Africa 2014
#########################################################################

my_corpus <- mp_corpus(countryname == "South Africa" & date == 201405)
summary(my_corpus)

quanteda_sa <- corpus(my_corpus)
summary(head(quanteda_sa))

# making the DFM (WITHOUT stem!)
dfm_sa <- build_dfm(quanteda_sa, stem = FALSE)
dfm_sa[1:10, 1:5]

dfm_sa2 <- dfm_group(dfm_sa, "manifesto_id")
dfm_sa2[1:5, 1:5]

# give me back the 5 most frequent words for each party
feature_frequencies_categories <- textstat_frequency(
  dfm_sa2,
  n = 5,
  groups = "manifesto_id"
)
str(feature_frequencies_categories)

# now I want to plot such frequencies using the original SA party names
# included in the CMP core dataset...
mpds <- mp_maindataset()
south_africa <- mpds[
  which(mpds$countryname == "South Africa" & mpds$date == 201405),
]
print(south_africa[c("partyname", "party", "edate", "date", "partyabbrev")])
south_africa$partyname

# Party names are attached to documents purely by position below (factor
# labels, `wfm$docs`, `refscores`), so at least make sure the counts agree.
# NOTE(review): this also presumes the dataset rows and the grouped documents
# are in the same order -- verify against the printed table above.
stopifnot(nrow(south_africa) == ndoc(dfm_sa2))

# Base-R replacement for the former `mutate()` call, which needed dplyr even
# though the script never attaches it.
feature_frequencies_categories$manifesto_id <- factor(
  feature_frequencies_categories$group,
  labels = south_africa$partyname
)
str(feature_frequencies_categories)

ggplot(
  feature_frequencies_categories,
  aes(x = reorder(feature, frequency), y = frequency, fill = manifesto_id)
) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "share of words per category") +
  facet_wrap(~manifesto_id, ncol = 2, scales = "free") +
  coord_flip()

#########################################################################
# Using wordfish
#########################################################################

# making the DFM (WITH stem!)
dfm_sa <- build_dfm(quanteda_sa, stem = TRUE)
dfm_sa[1:10, 1:5]

dfm_sa2 <- dfm_group(dfm_sa, "manifesto_id")
dfm_sa2[1:5, 1:5]

wfm <- textmodel_wordfish(dfm_sa2, dir = c(1, 3))
summary(wfm)
str(wfm)

# Relabel the documents with the party names (positional, see note above).
wfm$docs <- south_africa$partyname
wfm$docs

# Plot estimated word positions
textplot_scale1d(wfm, margin = "features")

# Plot estimated document positions
textplot_scale1d(wfm, margin = "documents")

#########################################################################
# Using wordscores
#########################################################################

south_africa$partyname

# on left-right: ANC=5.2; DA=10.5
# The remaining documents are left as NA (no reference score).
refscores <- c(NA, 5.2, 10.5, NA, NA)
refscores
stopifnot(length(refscores) == ndoc(dfm_sa2))

ws <- textmodel_wordscores(dfm_sa2, refscores)
summary(ws)

# scaling all texts (including the reference ones)
pr_all <- predict(ws, se.fit = TRUE)
pr_all

# check for the correlation
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit

scores_texts <- data.frame(party, score_wf, score_ws)
str(scores_texts)
cor(scores_texts$score_ws, scores_texts$score_wf)

# you can also draw a scatter, with a fit line and party names
plot_labelled_scatter(
  x = scores_texts$score_ws,
  y = scores_texts$score_wf,
  labels = scores_texts$party,
  main = "Scatterplot",
  xlab = "Wordscores",
  ylab = "Wordfish"
)

#########################################################################
# Compare wordfish & wordscores with RILE
#########################################################################

rile <- south_africa$rile
scores_texts2 <- data.frame(party, score_wf, score_ws, rile)
str(scores_texts2)

# `attach(scores_texts2)` was removed: the three vectors already exist as
# script-level variables and every later access goes through `scores_texts2$`.
set_lr <- cbind(rile, score_wf, score_ws)
chart.Correlation(set_lr)

# Add fit lines and party names (rile vs. wordfish)
plot_labelled_scatter(
  x = scores_texts2$rile,
  y = scores_texts2$score_wf,
  labels = scores_texts2$party,
  main = "Scatterplot Example",
  xlab = "RILE",
  ylab = "Wordfish "
)

# Add fit lines and party names (rile vs. wordscores)
plot_labelled_scatter(
  x = scores_texts2$rile,
  y = scores_texts2$score_ws,
  labels = scores_texts2$party,
  main = "Scatterplot Example",
  xlab = "RILE",
  ylab = "Wordscores "
)