rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(reshape2)
library(gridExtra)
library(cvTools)
library(ggplot2)  # used for the plots at the end (caret already attaches it, but being explicit does no harm)

#####################################################
# FIRST STEP: let's prepare the training set
#####################################################

# Let's focus on MOVIE reviews (our DV: either a positive or a negative movie review)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark, i.e., the share of the larger class: accuracy .524
prop.table(table(Dfm_train@docvars$Sentiment))

train <- as.matrix(Dfm_train)

######################################################
######################################################
# Let's run a CROSS-VALIDATION with k-fold = 5 using a loop!
# HERE you HAVE JUST 2 classes for the outcome (negative/positive)
######################################################
######################################################

# Some R packages, such as caret, allow you to run a k-fold cross-validation with a few lines of code.
# But I want you to understand what is going on, so it is better to work with this loop first!
# Then, if you wish, move to caret (a minimal caret sketch is shown right after the SVM loop below).

# STEP 1: create the 5 folds
ttrain <- train  # let's rename the original train matrix, given that we will reuse the name "train" inside the loop below

# let's split our training set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5  # the number of folds; whatever number of folds you choose here, the procedure below will always work!
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

######################################################
######################################################
# Let's start with SVM
######################################################
######################################################

# STEP 2: the LOOP
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]       # set the training set for this fold
  validation <- ttrain[folds$subsets[folds$which == i], ]  # set the validation set for this fold
  # Fit your new model (on the training data only) and ADD the name of the outcome (in this case "Sentiment")
  newrf <- svm(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
               x = train, kernel = 'linear', cost = 1)
  # Get the predictions for the validation set (from the model just fit on the training data)
  newpred <- predict(newrf, newdata = validation)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.sv", i)  # create the name of the object that stores the confusion matrix for each fold (=5)
  assign(df.name, df)
})

# STEP 3: the metrics
ls()  # we have created 5 objects storing the 5 confusion matrices, one per fold.
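# An aside, as anticipated in STEP 1: caret can run a similar 5-fold cross-validation
# in a few lines. This is just a minimal sketch, not part of the original workflow:
# it assumes the kernlab package is installed (caret's "svmLinear" method calls
# kernlab::ksvm rather than e1071::svm), and caret builds its own folds, so the
# numbers will not exactly match the hand-made loop above.
ctrl <- caret::trainControl(method = "cv", number = 5)
set.seed(123)
svm_caret <- caret::train(x = ttrain,
                          y = as.factor(Dfm_train@docvars$Sentiment),
                          method = "svmLinear",
                          trControl = ctrl,
                          tuneGrid = data.frame(C = 1))
svm_caret$results  # Accuracy and Kappa averaged over the 5 folds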
# We can now compute the performance metrics from these results
# for example: conf.mat.sv1
str(conf.mat.sv1)
conf.mat.sv1$overall[1]  # overall accuracy: (36+47)/(36+47+8+9)
# Note that the F1 value you see there is not the one for the overall model, but just for the first class (i.e., "negative").
# Therefore we have to estimate the average F1 value for each k-fold by hand! See below
conf.mat.sv1$byClass[1]  # Recall for negative: (36)/(36+9) - think vertically!
conf.mat.sv1$byClass[3]  # Precision for negative: (36)/(36+8) - think horizontally!
conf.mat.sv1$byClass[2]  # Recall for positive: (47)/(47+8) - think vertically!
conf.mat.sv1$byClass[4]  # Precision for positive: (47)/(47+9) - think horizontally!
# F1 for the negative class:
conf.mat.sv1$byClass[7]
# that is:
(2*conf.mat.sv1$byClass[3]*conf.mat.sv1$byClass[1])/(conf.mat.sv1$byClass[3]+conf.mat.sv1$byClass[1])  # F1 for negative
# and for positive?
(2*conf.mat.sv1$byClass[2]*conf.mat.sv1$byClass[4])/(conf.mat.sv1$byClass[2]+conf.mat.sv1$byClass[4])  # F1 for positive

# Make a blank data frame with three columns to fill with the results.
# Why 3 columns? 1 for accuracy, and 2 for the F1 values of the classes in Sentiment (given that you have just two classes!)
SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.sv"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for positive
  SVMPredict <- rbind(SVMPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(SVMPredict)

# Let's compute the average value for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
# Comparing the avg. accuracy with the avg. F1 value is always good practice; you can use it as a diagnostic tool.
# If the two values are very different, it implies that you are doing well on some class labels and poorly on others.
# In this case there is no huge difference (the two values are very close to each other).
acc_sv_avg
f1_sv_avg
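# An optional helper, not in the original script: it wraps the metric extraction
# used above so that the same three numbers can be pulled from any caret
# confusionMatrix object (assuming exactly two classes, as here).
get_metrics <- function(cm) {
  acc    <- cm$overall[1]
  f1_neg <- (2 * cm$byClass[1] * cm$byClass[3]) / (cm$byClass[1] + cm$byClass[3])
  f1_pos <- (2 * cm$byClass[2] * cm$byClass[4]) / (cm$byClass[2] + cm$byClass[4])
  c(Accuracy = unname(acc), F1_negative = unname(f1_neg), F1_positive = unname(f1_pos))
}
# For example, applied to the five SVM confusion matrices:
t(sapply(mget(ls(pattern = "conf.mat.sv")), get_metrics))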
######################################################
######################################################
# Random Forest LOOP
######################################################
######################################################

# If you have not run STEP 1 above yet, do it now; otherwise skip that step

# STEP 2: the LOOP
# here we fix ntree=100 to make things faster!
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- randomForest(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                        x = train, do.trace = TRUE, ntree = 100)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rf", i)
  assign(df.name, df)
})

# STEP 3: the metrics
RFPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.rf"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for positive
  RFPredict <- rbind(RFPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(RFPredict)

# Let's compute the average value for accuracy and F1
acc_rf_avg <- mean(RFPredict[, 1])
f1_rf_avg <- mean(colMeans(RFPredict[-1]))
acc_rf_avg
f1_rf_avg

######################################################
######################################################
# Naive Bayes LOOP
######################################################
######################################################

system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  newrf <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                                   x = train, laplace = 1)
  newpred <- predict(newrf, newdata = validation, type = "class")
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.nb", i)
  assign(df.name, df)
})

# STEP 3: the metrics
NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for negative
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for positive
  NBPredict <- rbind(NBPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(NBPredict)

# Let's compute the average value for accuracy and F1
acc_nb_avg <- mean(NBPredict[, 1])
f1_nb_avg <- mean(colMeans(NBPredict[-1]))
acc_nb_avg
f1_nb_avg

######################################################
######################################################
# Let's compare the results we got via Naive Bayes, SVM and RF
######################################################
######################################################

acc_nb_avg
acc_sv_avg
acc_rf_avg
f1_nb_avg
f1_sv_avg
f1_rf_avg
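# A compact side-by-side view of the six averages computed above (just a
# convenience table, not part of the original script):
cv_summary <- data.frame(
  Algorithm = c("Naive Bayes", "SVM", "Random Forest"),
  Accuracy  = c(acc_nb_avg, acc_sv_avg, acc_rf_avg),
  F1        = c(f1_nb_avg, f1_sv_avg, f1_rf_avg)
)
cv_summary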
# Let's plot the results!
gb1 <- as.data.frame(acc_nb_avg)
colnames(gb1)[1] <- "Accuracy NB"
gb2 <- as.data.frame(acc_sv_avg)
colnames(gb2)[1] <- "Accuracy SV"
gb3 <- as.data.frame(acc_rf_avg)
colnames(gb3)[1] <- "Accuracy RF"
ac_tot <- cbind(gb1, gb2, gb3)
ac_tot
str(ac_tot)
df.long_ac_tot <- melt(ac_tot)
str(df.long_ac_tot)
p <- ggplot(df.long_ac_tot, aes(x=variable, y=value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") + ylab(label="Values of Accuracy") +
  labs(title = "Cross-validation with k = 5: values of Accuracy") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

gb1 <- as.data.frame(f1_nb_avg)
colnames(gb1)[1] <- "F1 NB"
gb2 <- as.data.frame(f1_sv_avg)
colnames(gb2)[1] <- "F1 SV"
gb3 <- as.data.frame(f1_rf_avg)
colnames(gb3)[1] <- "F1 RF"
f1_tot <- cbind(gb1, gb2, gb3)
f1_tot
str(f1_tot)
df.long_f1_tot <- melt(f1_tot)
str(df.long_f1_tot)
p2 <- ggplot(df.long_f1_tot, aes(x=variable, y=value, color = variable)) +
  geom_boxplot() +
  xlab("Algorithm") + ylab(label="Values of F1") +
  labs(title = "Cross-validation with k = 5: values of F1") +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none")

grid.arrange(p, p2, nrow=2)  # plot everything together

# So in this particular case, NB appears to do better than the other 2 algorithms - yes!
# But remember: you are employing a training set consisting of just 500 documents,
# and NB usually does well in this scenario. Moreover...
# Wait a minute! In this example we kept the hyperparameters fixed!
# But is there any other hyperparameter configuration for RF, SVM, and NB that would improve the model?
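# A minimal sketch of what hyperparameter tuning could look like here, reusing the
# same folds (this is an illustration, not part of the original script; the grid of
# cost values below is arbitrary): loop over a few values of the SVM cost parameter
# and keep the average cross-validated accuracy for each.
costs <- c(0.5, 1, 5, 10)  # illustrative grid of cost values
cv_acc <- sapply(costs, function(cc) {
  fold_acc <- sapply(1:k, function(i) {
    tr    <- ttrain[folds$subsets[folds$which != i], ]
    val   <- ttrain[folds$subsets[folds$which == i], ]
    y_tr  <- as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment)
    y_val <- Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment
    fit   <- svm(y = y_tr, x = tr, kernel = "linear", cost = cc)
    mean(predict(fit, newdata = val) == y_val)  # accuracy on the validation fold
  })
  mean(fold_acc)
})
data.frame(cost = costs, avg_accuracy = cv_acc)  # pick the cost with the best average accuracy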