# Start from a clean workspace: remove every object, including the hidden ones
# whose name starts with a dot. The argument is spelled out in full: `all` only
# worked through partial matching of `all.names`.
rm(list = ls(all.names = TRUE))
# NOTE: machine-specific path; point it to the folder that holds the input files
# read below (e.g., "train_review2.csv").
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()  # show the working directory actually in use
library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)
library(e1071)

#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################

# Load the labelled MOVIE reviews (each one is either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors = FALSE)
str(x)

# Corpus -> tokens -> stop-word removal -> stemming, chained in a single pipeline
myCorpusTwitterTrain <- corpus(x)
tok2 <- myCorpusTwitterTrain %>%
  tokens(
    remove_punct = TRUE,
    remove_numbers = TRUE,
    remove_symbols = TRUE,
    split_hyphens = TRUE,
    remove_separators = TRUE
  ) %>%
  tokens_remove(stopwords("en")) %>%
  tokens_wordstem()

# Document-feature matrix, trimmed in order to keep only the tokens that appear
# in at least 5% of the reviews
Dfm_train <- dfm(tok2) %>%
  dfm_trim(min_docfreq = 0.05, docfreq_type = "prop", verbose = TRUE)
topfeatures(Dfm_train, n = 20)  # 20 top words

# our classes
# (the document variable is read through the docvars() accessor rather than the
# internal @docvars slot, which is an implementation detail of the dfm object)
table(docvars(Dfm_train, "Sentiment"))
# our benchmark: accuracy .524
prop.table(table(docvars(Dfm_train, "Sentiment")))

train <- as.matrix(Dfm_train)  # let's convert the dfm into a (dense) matrix

# Note: usually a hyperparameter grid search can improve your model fit with respect
# to the values you get when running a ML algorithm with its default hyperparameter values;
# but do not expect any dramatic change!

######################################################
######################################################
# Let's start to explore (let's tune!) the hyperparameters for the RF
######################################################
######################################################

# The main hyperparameters (with their default values) in the case of a RF are the following ones: 

# 1) "ntree" (Number of trees to grow; default=500),

# 2) "mtry" (Number of variables randomly sampled as candidates at each split; for classification the default is
# floor(sqrt(p)), where p is the number of variables in x. In our case p=nfeat(Dfm_train)=1139, so sqrt(p)=33.75, i.e., mtry=33)

# 3) "nodesize": Minimum size of terminal nodes. This controls the complexity of the trees. 
# A smaller node size allows for deeper, more complex trees, while a larger node size results in shallower trees. 
# This is another bias-variance tradeoff, where deeper trees introduce more variance (risk of overfitting) and shallower trees 
# introduce more bias (risk of not fully capturing unique patterns and relationships in the data). The default (for classification) is nodesize=1

# 4) "sampsize": the number of observations drawn to grow each tree. By default randomForest samples with
# replacement (replace=TRUE) and draws nrow(x) observations, so that about 63.2% of the training-set enters each
# bootstrap sample as unique observations (this is the expected share of unique observations in a bootstrap sample);
# when sampling without replacement, the default is instead sampsize=ceiling(.632*nrow(x))
# In our case:
nrow(train)  # number of documents in the training-set
round(.632*nrow(train))  # roughly the number of unique documents used to grow each tree
# Typically, when tuning this parameter we stay near the 60-80% range. However, it also depends on how large your training-set is! 
# With a very small training-set, increasing sampsize could be a reasonable choice

# Let's create a hyperparameter grid. You can add as many values and hyperparameters as you want. 
# Here just two: ntree (50 and 100) and mtry (28 and 34), with nodesize fixed at 1 and sampsize kept at its default (about 63.2% unique observations per tree) just to save time

# Build the search grid from a single named list: one row per (ntree, mtry)
# combination, plus two result columns initialised at 0
hyper_grid <- expand.grid(list(
  ntree     = c(50, 100),
  mtry      = c(28, 34),
  min_error = 0,  # a place to dump results
  accuracy  = 0   # a place to dump results
))

nrow(hyper_grid)  # 4 possibilities by crossing ntree with mtry
hyper_grid

# if you want to add several values you can write something like:
# ntree= seq(100, 300, by = 10)

# to tune the RF, let's employ the function "tune" (from the e1071 package).
# By default, tune implements a 10-fold CV. But you can control such values
# by using the command tune.control. In our case we set the folds to 5 to make things faster

# grid search 
# Grid search: one 5-fold cross-validated run of tune() per row of hyper_grid.
# The outcome does not change across iterations, so it is built once, through
# the docvars() accessor rather than the internal @docvars slot
sentiment <- as.factor(docvars(Dfm_train, "Sentiment"))

for (i in seq_len(nrow(hyper_grid))) {  # seq_len() is safe even with an empty grid
  # create parameter list (a single combination of values per run)
  params <- list(
    ntree = hyper_grid$ntree[i],
    mtry  = hyper_grid$mtry[i]
  )
  # reset the seed, so that every combination starts from the same random state
  set.seed(123)
  # train model
  rf.tune <- tune(
    randomForest,
    train.x = train,
    train.y = sentiment,
    ranges = params,
    do.trace = TRUE,
    tunecontrol = tune.control(cross = 5)
  )
  # add the (cross-validated) error and the corresponding accuracy to the grid
  hyper_grid$min_error[i] <- min(rf.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

# sampling scheme stored by the last tune() call (i.e., the number of folds for CV)
rf.tune[["sampling"]]

str(hyper_grid)
# let's see the results, sorted from the lowest to the highest error
hyper_grid %>%
  arrange(min_error) %>%
  head()

# Note that using "tune" we get by default only the estimated accuracy and not F1. Getting the latter would require some extra programming.
# However, here we are only selecting the best hyperparameter setting for each given ML; 
# then we have to go back to our previous script to run a full CV (with F1 included) across different MLs. 
# The rationale for focusing only on accuracy is that, for a given ML, there is a correlation between accuracy and F1, and if a RF model is doing better 
# than another RF model in terms of accuracy, it will probably do the same also in terms of F1

######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the NB!
######################################################
######################################################

# The main hyperparameter is the value of Laplace.
# In the NB case you cannot use the "tune" function, so we will proceed with a different script. 
# Let's see an example of changing the value of Laplace from 0.5 to 3.5 by 0.5

# STEP 1: create the folds
# keep a copy of the full training matrix under its own name: this is the object split into folds
ttrain <- train
k <- 5  # the number of folds; the procedure below works whatever number of folds you decide here
set.seed(123)  # set the seed for replicability
# let's split our training-set in k folds
folds <- cvFolds(n = nrow(ttrain), K = k)
str(folds)

# STEP 2: k-fold cross-validation of the multinomial NB for each candidate Laplace value.
# The outcome is built once as a factor (through the docvars() accessor): predictions and
# actual values then share the same levels, which keeps the confusion table square
sentiment <- as.factor(docvars(Dfm_train, "Sentiment"))

# you can change the values of the Laplace hyperparameter as you like. Here between 0.5 and 3.5 by 0.5
laplace_values <- seq(0.5, 3.5, by = 0.5)

# one result row per (Laplace value, fold), stored in a pre-allocated list instead of
# growing a data frame with rbind() at every iteration
fold_results <- vector("list", length(laplace_values) * k)
run <- 0L

for (j in laplace_values) {
  for (i in seq_len(k)) {
    # row indices of the validation fold and of the remaining (training) folds
    valid_idx <- folds$subsets[folds$which == i]
    train_idx <- folds$subsets[folds$which != i]
    # fold-specific names, so that the full matrix stored in "train" is not overwritten;
    # drop = FALSE keeps a matrix even if a fold holds a single document
    fold_train <- ttrain[train_idx, , drop = FALSE]  # Set the training set
    fold_valid <- ttrain[valid_idx, , drop = FALSE]  # Set the validation set
    # here you change the Laplace value
    nb_fit <- multinomial_naive_bayes(x = fold_train, y = sentiment[train_idx], laplace = j)
    # Get the predictions for the validation set (from the model just fit on the training folds)
    newpred <- predict(nb_fit, newdata = fold_valid)
    class_table <- table("Predictions" = newpred, "Actual" = sentiment[valid_idx])
    print(class_table)
    cm <- confusionMatrix(class_table, mode = "everything")
    accuracy <- cm$overall[["Accuracy"]]
    run <- run + 1L
    fold_results[[run]] <- data.frame(
      laplace = j,
      Accuracy = accuracy,
      min_error = 1 - accuracy
    )
  }
}

# stack the per-fold rows into a single data frame (columns: laplace, Accuracy, min_error)
NBPredict <- do.call(rbind, fold_results)

NBPredict
# average Accuracy and min_error (columns 2 and 3) across the folds, separately for each Laplace value
aggrLaplace <- aggregate(x = NBPredict[, 2:3], by = list(NBPredict$laplace), FUN = mean)
str(aggrLaplace)
names(aggrLaplace)[1] <- "Laplace"  # give the grouping column (the first one) a meaningful name
# Laplace values sorted from the most to the least accurate
with(aggrLaplace, aggrLaplace[order(-Accuracy), ])

######################################################
######################################################
# Once you have estimated the best hyperparameters setting for the ML, you could replicate the
# K-fold analysis to see now which is the most advisable ML algorithm (given your training-set) 
######################################################
######################################################