rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(stringr)
library(cvTools)
library(magicfor)
library(car) 
library(reshape2)
library(gridExtra)
library(cvTools)

#### let's repeat all the steps discussed the last time to build the training and the test for NB, RF and SVM

################################################################
# FIRST STEP: create the DFM for the training-set
################################################################

x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ")
myCorpusTwitterTrain <- corpus(x11)
Dfm_train<- dfm(myCorpusTwitterTrain , remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)

################################################################
# SECOND STEP: create the DFM for the test-set
################################################################

x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
myCorpusTwitterTest <- corpus(x10)
Dfm_test<- dfm(myCorpusTwitterTest, remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),  remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)

################################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
################################################################
test_dfm  <- dfm_match(Dfm_test, features = featnames(Dfm_train))

################################################################
# FOURTH STEP/B
# transform both dfm (train and test) in a data frame
################################################################

train <- as.data.frame(as.matrix(Dfm_train))
test <- as.data.frame(as.matrix(test_dfm))
colnames(train ) <- make.names(colnames(train ))
colnames(test ) <- make.names(colnames(test ))

######################################################
######################################################
# Let's make CROSS-VALIDATION with k-fold=5 with a loop!
######################################################
######################################################

# Some R packages, such as Caret, allows you to run a k-fold cross-validation with few lines of command.
# But I want that you understand what's going on, so better work with this loop! And then if you desire, move to Caret

######################################################
######################################################
# Let's start with SV
######################################################
######################################################

# STEP 1: create the folds
ttrain <- train # let's change the name of the original train data.frame, given that we are already going to use such name below in the loop
# let's split our training-set in 5 folds (this time we will use a slight different line of command - make things faster here)
set.seed(123) # set the see for replicability
k <- 5 # the number of folds; it does not matter the number of folds you decide here; the below procedure always will work!
folds <- cvFolds(NROW(ttrain ), K=k)
str(folds)

# STEP 2: the LOOP

system.time(for(i in 1:k){
  train <- ttrain [folds$subsets[folds$which != i], ] # Set the training set
  validation <- ttrain [folds$subsets[folds$which == i], ] # Set the validation set
  set.seed(123)
  newrf <- svm(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment) ,x=train, kernel='linear', cost = 1) # Get your new model 
# (just fit on the train data) and ADD the name of the output (in this case "Sentiment")
  newpred <- predict(newrf,newdata=validation) # Get the predicitons for the validation set (from the model just fit on the train data)
class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
print(class_table)  
  df<-confusionMatrix( class_table,  mode = "everything") 
  df.name<-paste0("conf.mat.sv",i) # create the name for the object that will save the confusion matrix for each loop (=5)  
  assign(df.name,df)
})

# STEP 3: the metrics
ls()
# we have just created 5 objects that have saved the 5 confusion matrices we have created. I can estimate now the performance metrics on such results

SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) # makes a blank data frame with four columns to fill with the predictions; 
# why 4 columns? 1 for accuracy; and 3 for the K1 value of the classes in the Sentiment: negative, neutral, positive. According to the number of classes
# in your output variable, changes the number of columns to fill!!!
for(i in  mget(ls(pattern = "conf.mat.sv")) ) {
col1 <-(i)$overall[1] # save in the matrix the accuracy value
p <- as.data.frame((i)$byClass)
col2 <- p$F1[1] # save in the matrix the F1 value for negative
col3 <- p$F1[2] # save in the matrix the F1 value for neutral
col4 <- p$F1[3] # save in the matrix the F1 value for positive
SVMPredict <- rbind(SVMPredict , cbind(col1, col2, col3, col4))
 }

colnames(SVMPredict )[1] <- "Accuracy"
colnames(SVMPredict )[2] <- "F1 Negative"
colnames(SVMPredict )[3] <- "F1 Neutral"
colnames(SVMPredict )[4] <- "F1 Positive"
SVMPredict [is.na(SVMPredict )] <- 0 # if I get some NA for some categories with respect to F1 (this happens when BOTH precision and recall score for that category is 0), replace NA with 0 
SVMPredict
str(SVMPredict )

# Let's compare the average value for accuracy and f1

acc_sv_avg <- mean(SVMPredict[, 1] )
f1_sv_avg <- mean(colMeans(SVMPredict[-1] ))

acc_sv_avg
f1_sv_avg

######################################################
######################################################
# Random Forest LOOP
######################################################
######################################################

# If you have not still done STEP 1 above, replicate it; otherwise skip this step

# STEP 2: the LOOP

system.time(for(i in 1:k){
  train <- ttrain [folds$subsets[folds$which != i], ] 
  validation <- ttrain [folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- randomForest(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment) ,x=train, do.trace=TRUE, ntree=100) 
  newpred <- predict(newrf,newdata=validation, type="class") 
class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
print(class_table)  
  df<-confusionMatrix( class_table, mode = "everything") 
  df.name<-paste0("conf.mat.rf",i)
  assign(df.name,df)
})

# STEP 3: the metrics

RFPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) 
for(i in  mget(ls(pattern = "conf.mat.rf")) ) {
col1 <-(i)$overall[1] # save in the matrix the accuracy value
p <- as.data.frame((i)$byClass)
col2 <- p$F1[1] # save in the matrix the F1 value for negative
col3 <- p$F1[2] # save in the matrix the F1 value for neutral
col4 <- p$F1[3] # save in the matrix the F1 value for positive
RFPredict <- rbind(RFPredict , cbind(col1, col2, col3, col4))
 }

colnames(RFPredict )[1] <- "Accuracy"
colnames(RFPredict )[2] <- "F1 Negative"
colnames(RFPredict )[3] <- "F1 Neutral"
colnames(RFPredict )[4] <- "F1 Positive"
RFPredict [is.na(RFPredict )] <- 0 
RFPredict 
str(RFPredict )

# Let's compare the average value for accuracy and f1

acc_rf_avg <- mean(RFPredict [, 1] )
f1_rf_avg <- mean(colMeans(RFPredict [-1] ))

acc_rf_avg
f1_rf_avg

######################################################
######################################################
# Naive Bayes LOOP
######################################################
######################################################

# STEP 1: create the folds (here let's follow our usual procedure that we saw in the "CV first part" script) 

summary( myCorpusTwitterTrain )
N <- ndoc(myCorpusTwitterTrain )
k <- 5 # Number of desired folds
set.seed(123)
holdout <- split(sample(1:N), 1:k)
docvars(myCorpusTwitterTrain , "id_numeric") <- 1:ndoc(myCorpusTwitterTrain )

# STEP 2: the LOOP

system.time(for(i in 1:k){
train <- dfm(corpus_subset(myCorpusTwitterTrain , id_numeric %in% holdout[[i]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE) # Set the training set
train <- dfm_trim(train,min_docfreq= 2, verbose=TRUE) # Trim the dfm
validation <- dfm(corpus_subset(myCorpusTwitterTrain ,  !id_numeric %in% holdout[[i]]), remove = c(stopwords("english"), ("amp"), ("rt") ,("tco"), ("co"), ("u"), ("t"), ("s"), ("ed"), ("https"), ("ā"),
("com"), ("ly")),remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_twitter = TRUE, 
remove_separators=TRUE, remove_url = TRUE) # Set the validation set
validation  <- dfm_trim(validation , min_docfreq= 2, verbose=TRUE) # Trim the dfm
set.seed(123)
newrf <- textmodel_nb(train, docvars(train, "Sentiment"), distribution = c("multinomial")) # Get your new model (just fit on the train data) and ADD the name of the output (in this case "Sentiment")
validation <- dfm_select(validation, train) # Let's make the features identical by passing training_dfm to dfm_select() as a pattern
newpred <- predict(newrf,validation) # Get the predicitons for the validation set (from the model just fit on the train data)
actual_class <- docvars(validation, "Sentiment") # Add name output (in this case "Sentiment")
predicted_class <- predict(newrf, newdata = validation)
tab_class <- table(predicted_class, actual_class)
 df<-confusionMatrix( tab_class,  mode = "everything") # Add name output (in this case "Sentiment")
  df.name<-paste0("conf.mat.nb",i)
  assign(df.name,df)
})

# STEP 3: the metrics

NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) 
for(i in  mget(ls(pattern = "conf.mat.nb")) ) {
col1 <-(i)$overall[1] # save in the matrix the accuracy value
p <- as.data.frame((i)$byClass)
col2 <- p$F1[1] # save in the matrix the F1 value for negative
col3 <- p$F1[2] # save in the matrix the F1 value for neutral
col4 <- p$F1[3] # save in the matrix the F1 value for positive
NBPredict <- rbind(NBPredict , cbind(col1, col2, col3, col4))
 }

colnames(NBPredict )[1] <- "Accuracy"
colnames(NBPredict )[2] <- "F1 Negative"
colnames(NBPredict )[3] <- "F1 Neutral"
colnames(NBPredict )[4] <- "F1 Positive"
NBPredict [is.na(NBPredict )] <- 0 
NBPredict 
str(NBPredict )

# Let's compare the average value for accuracy and f1

acc_nb_avg <- mean(NBPredict [, 1] )
f1_nb_avg <- mean(colMeans(NBPredict [-1] ))

acc_nb_avg
f1_nb_avg 

######################################################
######################################################
# Let's compare the results we got via Naive Bayes, SVM and RF
######################################################
######################################################

acc_nb_avg
acc_sv_avg
acc_rf_avg

f1_nb_avg
f1_sv_avg
f1_rf_avg

# Let's plot the results!

gb1 <- as.data.frame(acc_nb_avg )
colnames(gb1)[1] <- "Accuracy NB"
gb2 <- as.data.frame(acc_sv_avg )
colnames(gb2)[1] <- "Accuracy SV"
gb3 <- as.data.frame(acc_rf_avg )
colnames(gb3)[1] <- "Accuracy RF"
ac_tot <- cbind(gb1, gb2, gb3)
ac_tot
str(ac_tot)
df.long_ac_tot<-melt(ac_tot)
str(df.long_ac_tot)

p <- ggplot(df.long_ac_tot, aes(x=variable, y=value)) + 
   geom_boxplot() + xlab("Algorithm") + ylab(label="Value") + 
ggtitle("Naive Bayes vs. SVM vs. RF K-fold cross-validation (K=5): Accuracy") + coord_flip() 

gb1 <- as.data.frame(f1_nb_avg)
colnames(gb1)[1] <- "F1 NB"
gb2 <- as.data.frame(f1_sv_avg )
colnames(gb2)[1] <- "F1 SV"
gb3 <- as.data.frame(f1_rf_avg )
colnames(gb3)[1] <- "F1 RF"
f1_tot <- cbind(gb1, gb2, gb3)
f1_tot
str(f1_tot)
df.long_f1_tot<-melt(f1_tot)
str(df.long_f1_tot)

p2 <- ggplot(df.long_f1_tot, aes(x=variable, y=value)) + 
   geom_boxplot() + xlab("Algorithm") + ylab(label="Value") + 
ggtitle("Naive Bayes vs. SVM vs. RF K-fold cross-validation (K=5): F1") + coord_flip() 

grid.arrange(p, p2,  nrow=2) # Plot everything together

# So in this particular case, SVM better than the other 2 algorithms? Wait a minute! In this example we kept the hyperparameters fixed! 
# But is there any other hyperameters configuration for RF and SVM that will improve the model [for NB in Quanteda, no hyperparameters need to be changed]?