rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)
# also needed below (plots, SVM permutation importance, parallel back-end)
library(ggplot2)
library(iml)
library(future)
library(future.callr)

#####################################################
# let's prepare the training-set with 3 categories
# (this script works fine for any number of categories > 2)
#####################################################

uk_train <- read.csv("uk_train2022.csv")
str(uk_train)

myCorpusTwitterTrain <- corpus(uk_train)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE,
               remove_separators = TRUE, remove_url = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets
# (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
topfeatures(Dfm_train, 20)  # 20 top words

# Always check whether, after trimming, some texts are left with only 0s!
# 7 texts in this case
Dfm_train[ntoken(Dfm_train) == 0, ]
Dfm_train <- Dfm_train[ntoken(Dfm_train) != 0, ]
Dfm_train[ntoken(Dfm_train) == 0, ]

# our classes
table(Dfm_train@docvars$sentiment)
# our benchmark: accuracy .57 (i.e., always predicting the most frequent class)
prop.table(table(Dfm_train@docvars$sentiment))

train <- as.matrix(Dfm_train)

######################################################
######################################################
# Which are the main changes compared to the previous scripts?
######################################################
######################################################
# First, during the CV stage:
# consider the case of an SVM - but the same applies of course to all the other ML algorithms
######################################################
######################################################

# STEP 1: create the 5 folds
ttrain <- train
nrow(ttrain)
set.seed(1234)  # set the seed for replicability
k <- 5
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

# STEP 2: the LOOP
system.time(for (i in 1:k) {
  train <- ttrain[folds$subsets[folds$which != i], ]       # set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ]  # set the validation set
  # fit the model on the training folds only, passing the outcome (here: "sentiment")
  newrf <- svm(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$sentiment),
               x = train, kernel = 'linear', cost = 1)
  # get the predictions for the validation set (from the model just fit on the training folds)
  newpred <- predict(newrf, newdata = validation)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.sv", i)  # name of the object that stores the confusion matrix of each fold (=5)
  assign(df.name, df)
})

# STEP 3: the metrics
SVMPredict <- data.frame(col1 = vector(), col2 = vector(), col3 = vector(), col4 = vector())

##### FIRST CHANGE
# Why 4 columns NOW? 1 for accuracy, and 3 for the F1 value of each class of the
# sentiment variable: negative, neutral, positive.
# The number of columns to fill changes according to the number of classes in your output variable!
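# [Aside, not in the original script] A sketch of a class-count-agnostic alternative:
# instead of hard-coding one F1 column per class, collect the accuracy plus the whole
# per-class F1 vector of each fold and bind the rows. It only assumes the
# "conf.mat.sv" objects created by the loop above.
fold_metrics <- function(cm) {
  c(Accuracy = unname(cm$overall["Accuracy"]),
    F1 = as.data.frame(cm$byClass)$F1)  # one F1 entry per class, whatever their number
}
do.call(rbind, lapply(mget(ls(pattern = "conf.mat.sv")), fold_metrics))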
for (i in mget(ls(pattern = "conf.mat.sv"))) {
  Accuracy <- i$overall[1]  # save the accuracy value of the fold
  ##### SECOND CHANGE: the following 4 lines
  p <- as.data.frame(i$byClass)
  F1_negative <- p$F1[1]  # save the F1 value for the negative class
  F1_neutral  <- p$F1[2]  # save the F1 value for the neutral class
  F1_positive <- p$F1[3]  # save the F1 value for the positive class
  SVMPredict <- rbind(SVMPredict, cbind(Accuracy, F1_negative, F1_neutral, F1_positive))
}
str(SVMPredict)
# you can see that we are not doing that well with the class "negative"

# let's compare the average value for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
acc_sv_avg
f1_sv_avg
# you see here that we do not improve that much compared to our benchmark model.
# Moreover, there is a wide gap between accuracy and the avg. value of F1: why?
# Because we are doing relatively badly with the "negative" class
mean(colMeans(SVMPredict[2]))
mean(colMeans(SVMPredict[3]))
mean(colMeans(SVMPredict[4]))

######################################################
######################################################
# Second: the global interpretation of your results [you can go through this part on your own]
######################################################
######################################################

#####################################################
# Let's start with a Naive Bayes model
#####################################################
train <- as.matrix(Dfm_train)
set.seed(123)
system.time(NB22 <- multinomial_naive_bayes(x = train,
                                            y = as.factor(Dfm_train@docvars$sentiment),
                                            laplace = 1))
summary(NB22)

NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
# let's identify, for each feature, the max value among the negative, neutral and positive conditional probabilities
NB_prob$winner <- apply(NB_prob[c(1:3)], 1, FUN = max)
str(NB_prob)
NB_prob$sentiment <- ifelse(NB_prob$winner == NB_prob$negative, "negative",
                            ifelse(NB_prob$winner == NB_prob$positive, "positive", "neutral"))
str(NB_prob)

negatives <- NB_prob[which(NB_prob$sentiment == "negative"), ]
positives <- NB_prob[which(NB_prob$sentiment == "positive"), ]
neutrals  <- NB_prob[which(NB_prob$sentiment == "neutral"), ]
# the features with the largest conditional probability within each class
print(head(negatives[order(negatives$winner, decreasing = TRUE), ], 15))  # negative words
print(head(positives[order(positives$winner, decreasing = TRUE), ], 15))  # positive words
print(head(neutrals[order(neutrals$winner, decreasing = TRUE), ], 15))    # neutral words

# let's extract the 15 most negative, 15 most positive and 15 most neutral contributing features
df1 <- top_n(negatives, 15, winner)
df2 <- top_n(positives, 15, winner)
df3 <- top_n(neutrals, 15, winner)
NB_prob_new <- rbind(df1, df2, df3)

# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, winner))
ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes Model")
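# [Aside, not in the original script] As a quick sanity check, one can also look at the
# in-sample predictions of the Naive Bayes model (the same kind of check is done below
# for the Random Forest and the SVM); predict() for multinomial_naive_bayes accepts the
# same matrix used for training:
predicted_nb <- predict(NB22, train)
table("Predictions" = predicted_nb, "Actual" = Dfm_train@docvars$sentiment)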
# let's reorder the features, but differentiating according to the category (positive, negative and neutral)
NB_prob_new$Feature <- with(NB_prob_new,
                            factor(Feature, levels = Feature[order(ave(winner, sentiment, FUN = max), winner)]))
ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes Model")

NB_graph <- ggplot(NB_prob_new, aes(Feature, winner, fill = sentiment)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  coord_flip() +
  theme(legend.position = "bottom") +
  ylab("Conditional probabilities") +
  scale_fill_manual(values = c("#000033", "#006699", "#99CCFF")) +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "positive versus neutral versus negative words - Naive Bayes Model")

#####################################################
# let's run a Random Forest
#####################################################
set.seed(123)  # define a seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$sentiment), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
RF
str(RF$err.rate)
plot(RF, type = "l", col = c("red", "black", "springgreen4", "blue"),
     main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = FALSE, cex = 0.7,
       fill = c("red", "black", "springgreen4", "blue"),
       c("OOB", "negative", "neutral", "positive"))

head(RF$importance[, 4:5])
varImpPlot(RF)
# let's extract the matrix for Gini and Accuracy
importance_RF <- as.data.frame(RF$importance[, 4:5])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing = TRUE), ]))

predicted_rf <- predict(RF, train, type = "class")
table(predicted_rf)
Dfm_train@docvars$predRF <- ifelse(predicted_rf == "negative", 0,
                                   ifelse(predicted_rf == "neutral", 1, 2))
table(Dfm_train@docvars$predRF)

# adding the sign [0/1/2 according to the predicted class of the tweet: negative (0), neutral (1), positive (2)]
sums <- list()
for (v in 0:2) {
  sums[[v + 1]] <- colSums(train[Dfm_train@docvars[, "predRF"] == v, ], na.rm = TRUE)
}
sums <- do.call(cbind, sums)
# each feature is assigned to the predicted class in whose documents it occurs most often
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(Feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)

importance <- merge(importance_RF, df, by = "Feature")
str(importance)
table(importance$sign)

## best predictors per class
for (v in 0:2) {
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$MeanDecreaseGini, decreasing = TRUE), ]
  print(head(importance[importance$sign == v, ], n = 10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign == v], n = 10)), collapse = ", "))
}
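# [Aside, not in the original script] the OOB confusion matrix behind the error-rate
# plot drawn above is stored in the randomForest object and can be printed directly:
RF$confusion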
# let's draw a graph with our results!
str(importance)
negatives <- importance[which(importance$sign == 0), ]
neutrals  <- importance[which(importance$sign == 1), ]
positives <- importance[which(importance$sign == 2), ]

df1 <- top_n(negatives, 10, MeanDecreaseGini)
df2 <- top_n(positives, 10, MeanDecreaseGini)
df3 <- top_n(neutrals, 10, MeanDecreaseGini)
RF_prob_new <- rbind(df1, df2, df3)
str(RF_prob_new)
RF_prob_new$sign2 <- factor(RF_prob_new$sign, labels = c("Negative", "Neutral", "Positive"))
str(RF_prob_new)
table(RF_prob_new$sign)
table(RF_prob_new$sign2)

# reorder the features
RF_prob_new <- mutate(RF_prob_new, Feature = reorder(Feature, MeanDecreaseGini))
ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

# let's reorder the features, but differentiating according to the category (positive, negative and neutral)
RF_prob_new$Feature <- with(RF_prob_new,
                            factor(Feature, levels = Feature[order(ave(MeanDecreaseGini, sign2, FUN = max), MeanDecreaseGini)]))
ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

RF_graph <- ggplot(RF_prob_new, aes(Feature, MeanDecreaseGini, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Mean Decrease Gini") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - Random Forest Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
#####################################################
# Let's estimate a SVM
#####################################################
set.seed(123)  # define a seed to be able to replicate the results!
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$sentiment), x = train,
                      kernel = 'linear', cost = 1))
length(SV$index)

# permutation importance via the iml package
# (Predictor$new needs a data frame with syntactic column names)
train2 <- convert(Dfm_train, to = "data.frame")
colnames(train2) <- make.names(colnames(train2), unique = TRUE)
# rerun the SVM with the data frame (dropping the doc_id column)
set.seed(123)
tree2 <- svm(as.factor(Dfm_train@docvars$sentiment) ~ ., data = train2[-1], kernel = 'linear', cost = 1)
mod <- Predictor$new(tree2, data = train2[-1], y = as.factor(Dfm_train@docvars$sentiment), type = "prob")
system.time({
  plan("callr", workers = 6)
  set.seed(123)
  imp2 <- FeatureImp$new(mod, loss = "ce", n.repetitions = 1)
})
imp2
res <- imp2$results

colnames(train) <- make.names(colnames(train), unique = TRUE)
predicted_sv <- predict(tree2, train, type = "class")
table(predicted_sv)
Dfm_train@docvars$predSV <- ifelse(predicted_sv == "negative", 0,
                                   ifelse(predicted_sv == "neutral", 1, 2))
table(Dfm_train@docvars$predSV)

# adding the sign [0/1/2 according to the predicted class of the tweet: negative (0), neutral (1), positive (2)]
sums <- list()
for (v in 0:2) {
  sums[[v + 1]] <- colSums(train[Dfm_train@docvars[, "predSV"] == v, ], na.rm = TRUE)
}
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)

# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)
str(res)

importanceSVM <- merge(res, df, by = "feature")
str(importanceSVM)
names(importanceSVM)[3] <- "importanceSVM"
str(importanceSVM)

## best predictors
for (v in 0:2) {
  cat("\n\n")
  cat("value==", v)
  importanceSVM <- importanceSVM[order(importanceSVM$importanceSVM, decreasing = TRUE), ]
  print(head(importanceSVM[importanceSVM$sign == v, ], n = 10))
  cat("\n")
  cat(paste(unique(head(importanceSVM$feature[importanceSVM$sign == v], n = 10)), collapse = ", "))
}

negatives <- importanceSVM[which(importanceSVM$sign == 0), ]
neutrals  <- importanceSVM[which(importanceSVM$sign == 1), ]
positives <- importanceSVM[which(importanceSVM$sign == 2), ]
df1 <- top_n(negatives, 10, importanceSVM)
df2 <- top_n(positives, 10, importanceSVM)
df3 <- top_n(neutrals, 10, importanceSVM)
SV_prob_new <- rbind(df1, df2, df3)
str(SV_prob_new)
SV_prob_new$sign2 <- factor(SV_prob_new$sign, labels = c("Negative", "Neutral", "Positive"))
str(SV_prob_new)
table(SV_prob_new$sign)
table(SV_prob_new$sign2)

# reorder the features
SV_prob_new <- mutate(SV_prob_new, feature = reorder(feature, importanceSVM))
ggplot(SV_prob_new, aes(feature, importanceSVM, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Permutation Results") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - SVM Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

# let's reorder the features, but differentiating according to the category (positive, negative and neutral)
SV_prob_new$feature <- with(SV_prob_new,
                            factor(feature, levels = feature[order(ave(importanceSVM, sign2, FUN = max), importanceSVM)]))
ggplot(SV_prob_new, aes(feature, importanceSVM, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Permutation Results") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - SVM Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))

SVM_graph <- ggplot(SV_prob_new, aes(feature, importanceSVM, fill = sign2)) +
  geom_bar(stat = "identity", fill = "white") +
  theme_classic() +
  geom_col(show.legend = TRUE) +
  theme(legend.position = "bottom") +
  coord_flip() +
  ylab("Permutation Results") +
  labs(title = "Tweets about uk 2014 EP elections",
       subtitle = "negative versus neutral versus positive features - SVM Model",
       fill = "sentiment") +
  scale_fill_manual(values = c("#003333", "#009999", "steelblue"))
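# [Aside, not in the original script] a caveat on the permutation step above:
# n.repetitions = 1 keeps FeatureImp fast but noisy; if time allows, repeating the
# permutations gives more stable importance estimates (this re-runs the heavy step):
set.seed(123)
imp_stable <- FeatureImp$new(mod, loss = "ce", n.repetitions = 5)
head(imp_stable$results, 10)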
#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, SVM_graph, nrow = 2)  # plot everything together

# let's check some correlations
str(NB_prob)
str(importance)
str(importanceSVM)
names(importanceSVM)[1] <- "Feature"
# restore the "@" prefix that make.names() turned into "X." so that feature names match again
importanceSVM$Feature <- gsub("^X\\.", "@", importanceSVM$Feature)
str(importanceSVM)

# let's merge the data frames
words <- merge(NB_prob, importance, by = "Feature")
words <- merge(words, importanceSVM, by = "Feature")
str(words)

library(PerformanceAnalytics)
# NB winner, RF MeanDecreaseAccuracy/MeanDecreaseGini, SVM permutation importance
chart.Correlation(words[c(5, 7:8, 11)])
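# [Aside, not in the original script] since the three importance measures live on very
# different scales, a rank-based correlation can be informative as well:
chart.Correlation(words[c(5, 7:8, 11)], method = "spearman")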