# Session setup: start from an empty global environment and move into the
# project directory.
# NOTE: rm() wipes every object (including hidden ones) in the caller's
# workspace; it is kept because the rest of the script may rely on a clean
# slate, but it makes the script unsafe to source() into a live session.
rm(list = ls(all.names = TRUE))
getwd()

# The project directory is machine-specific. Only switch to it when it exists
# so the script does not abort on other machines; otherwise stay where we are.
project_dir <- "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning(
    "Project directory not found, keeping the current working directory: ",
    project_dir,
    call. = FALSE
  )
}
getwd()
library(manifestoR)
library(quanteda)
library(PerformanceAnalytics)

# Register the Manifesto Project API key. The key can be supplied through the
# MANIFESTO_API_KEY environment variable so it does not have to live in the
# source file; when the variable is unset or empty the literal below is used.
api_key <- Sys.getenv("MANIFESTO_API_KEY")
if (!nzchar(api_key)) {
  api_key <- "XXXXXXXXXX"
}
mp_setapikey(key.file = NULL, key = api_key)

#########################################################################
# Assignment with South Africa 2014
#########################################################################

# Fetch the manifestos for South Africa, election date 201405, and convert the
# result into a quanteda corpus.
my_corpus <- mp_corpus(countryname == "South Africa" & date == 201405)
summary(my_corpus)
quanteda_sa <- corpus(my_corpus)
summary(head(quanteda_sa))

# Document-feature matrix WITHOUT stemming: lower-cased, with punctuation,
# numbers and English stopwords removed.
dfm_sa <- dfm(
  quanteda_sa,
  stem = FALSE,
  tolower = TRUE,
  remove_punct = TRUE,
  remove = stopwords("english"),
  remove_numbers = TRUE
)
dfm_sa[1:10, 1:5]

# Combine the rows that share the same "manifesto_id" document variable.
dfm_sa2 <- dfm_group(dfm_sa, groups = "manifesto_id")
dfm_sa2[1:5, 1:5]

# Return the 5 most frequent words for each party (one group per manifesto_id).
feature_frequencies_categories <- textstat_frequency(
  dfm_sa2,
  n = 5,
  groups = "manifesto_id"
)
str(feature_frequencies_categories)

# To plot these frequencies with readable labels, look up the original South
# African party names in the CMP core dataset.
mpds <- mp_maindataset()
south_africa <- mpds[which(mpds$countryname == "South Africa" & mpds$date == 201405), ]
print(south_africa[c("partyname", "party", "edate", "date", "partyabbrev")])
south_africa$partyname

# Attach the party name to every row as a factor column `manifesto_id`.
# The names are matched to the groups by key rather than by position.
# NOTE(review): assumes manifesto ids have the form "<party>_<date>" -- confirm
# against the manifestoR documentation. If any group cannot be matched this
# way, fall back to assigning the names in row order of `south_africa`.
group_factor <- factor(feature_frequencies_categories$group)
group_levels <- levels(group_factor)
party_keys <- paste(south_africa$party, south_africa$date, sep = "_")
key_position <- match(group_levels, party_keys)
if (anyNA(key_position)) {
  warning(
    "Could not match every manifesto id to a party/date pair; ",
    "party names are assigned by position instead.",
    call. = FALSE
  )
  party_labels <- south_africa$partyname
} else {
  party_labels <- south_africa$partyname[key_position]
}
# Base-R assignment is used on purpose: dplyr is not attached by this script.
feature_frequencies_categories$manifesto_id <- factor(
  group_factor,
  levels = group_levels,
  labels = party_labels
)
str(feature_frequencies_categories)

library(ggplot2)

# Horizontal bar chart of the top words, one panel per party.
# Columns are referenced by bare name inside aes() and facet_wrap() so that
# ggplot2 resolves them in the plot data for every panel; `data$column`
# bypasses that lookup and is fragile when combined with faceting.
ggplot(
  feature_frequencies_categories,
  aes(
    x = reorder(feature, frequency),
    y = frequency,
    fill = manifesto_id
  )
) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "share of words per category") +
  # Free scales: every party panel gets its own word axis and count range.
  facet_wrap(~manifesto_id, ncol = 2, scales = "free") +
  coord_flip()

#########################################################################
# Using wordfish 
#########################################################################

# Rebuild the DFM, this time WITH stemming. This overwrites dfm_sa and dfm_sa2.
dfm_sa <- dfm(
  quanteda_sa,
  stem = TRUE,
  tolower = TRUE,
  remove_punct = TRUE,
  remove = stopwords("english"),
  remove_numbers = TRUE
)
dfm_sa[1:10, 1:5]
dfm_sa2 <- dfm_group(dfm_sa, groups = "manifesto_id")
dfm_sa2[1:5, 1:5]

# Fit the Wordfish model; `dir` names the pair of documents (1 and 3) used to
# fix the direction of the estimated scale.
wfm <- textmodel_wordfish(dfm_sa2, dir = c(1, 3))
summary(wfm)
str(wfm)

# Replace the document labels with party names.
# NOTE(review): assumes the rows of `south_africa` are in the same order as the
# documents of dfm_sa2 -- confirm.
wfm$docs <- south_africa$partyname
wfm$docs

# Estimated word positions
textplot_scale1d(wfm, margin = "features")

# Estimated document positions
textplot_scale1d(wfm, margin = "documents")

#########################################################################
# Using wordscores 
#########################################################################

south_africa$partyname
# Left-right reference positions: ANC = 5.2, DA = 10.5. NA marks the texts
# that have no reference score.
# NOTE(review): presumes ANC and DA are documents 2 and 3 of dfm_sa2 -- confirm.
refscores <- c(NA, 5.2, 10.5, NA, NA)
refscores
ws <- textmodel_wordscores(dfm_sa2, refscores)
summary(ws)

# Score every text, the reference texts included, with standard errors.
pr_all <- predict(ws, se.fit = TRUE)
pr_all

# Collect both sets of estimates side by side and correlate them.
party <- wfm$docs
score_wf <- wfm$theta
score_ws <- pr_all$fit
scores_texts <- data.frame(party, score_wf, score_ws)
str(scores_texts)
with(scores_texts, cor(score_ws, score_wf))

# Scatterplot of Wordscores (x) against Wordfish (y), labelled with the party
# names and overlaid with the least-squares line of y on x.
with(
  scores_texts,
  plot(
    score_ws, score_wf,
    main = "Scatterplot",
    xlab = "Wordscores", ylab = "Wordfish", pch = 19
  )
)
with(
  scores_texts,
  text(score_ws, score_wf, labels = party, pos = 4, col = "royalblue", cex = 0.8)
)
abline(lm(score_wf ~ score_ws, data = scores_texts), col = "red")

#########################################################################
# Compare wordfish & wordscores with RILE
#########################################################################

# Add the RILE score from the core dataset to the Wordfish and Wordscores
# estimates.
rile <- south_africa$rile
scores_texts2 <- data.frame(party, score_wf, score_ws, rile)
str(scores_texts2)

# Build the numeric matrix for the correlation chart straight from the data
# frame. attach() is deliberately not used: it was never detached, it leaves
# the data frame on the search path for the rest of the session, and its
# columns are masked by the same-named global variables anyway.
set_lr <- with(scores_texts2, cbind(rile, score_wf, score_ws))
chart.Correlation(set_lr)

# RILE vs. Wordfish: scatterplot with party names and least-squares line (y ~ x)
with(
  scores_texts2,
  plot(
    rile, score_wf,
    main = "Scatterplot Example",
    xlab = "RILE", ylab = "Wordfish  ", pch = 19
  )
)
with(
  scores_texts2,
  text(rile, score_wf, labels = party, pos = 4, col = "royalblue", cex = 0.8)
)
abline(lm(score_wf ~ rile, data = scores_texts2), col = "red")

# RILE vs. Wordscores: scatterplot with party names and least-squares line (y ~ x)
with(
  scores_texts2,
  plot(
    rile, score_ws,
    main = "Scatterplot Example",
    xlab = "RILE", ylab = "Wordscores  ", pch = 19
  )
)
with(
  scores_texts2,
  text(rile, score_ws, labels = party, pos = 4, col = "royalblue", cex = 0.8)
)
abline(lm(score_ws ~ rile, data = scores_texts2), col = "red")