rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)
library(xgboost)
library(Ckmeans.1d.dp)

#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)

myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")), remove_punct = TRUE, remove_numbers = TRUE,
                 tolower = TRUE, remove_symbols = TRUE, remove_separators = TRUE, remove_url = TRUE, split_hyphens = TRUE)

# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words
train <- as.matrix(Dfm_train)  # dense matrix

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: accuracy .524
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Let's start to explore (let's tune!) the hyperparameters for the RF
######################################################
######################################################

# The main default hyperparameters in the case of a RF are the following ones:
# "ntree": number of trees to grow (default = 500)
# "mtry": number of variables randomly sampled as candidates at each split; the default is sqrt(p), where p is the number of
# variables in x. In our case p = length(Dfm_train@Dimnames$features) = 1012!
# "nodesize": minimum size of terminal nodes. This controls the complexity of the trees.
# A smaller node size allows for deeper, more complex trees, while a larger node size results in shallower trees.
# This is another bias-variance tradeoff: deeper trees introduce more variance (risk of overfitting) and shallower trees
# introduce more bias (risk of not fully capturing unique patterns and relationships in the data). The default is nodesize = 1
# "maxnodes": maximum number of terminal nodes. Another way to control the complexity of the trees.
# More nodes equate to deeper, more complex trees; fewer nodes result in shallower trees.
# "sampsize": the number of samples to train on. The default corresponds to about 63.25% of the training set, since this is the
# expected share of unique observations in a bootstrap sample. Lower sample sizes can reduce the training time but may introduce
# more bias than necessary. Increasing the sample size can increase performance, but at the risk of overfitting because it
# introduces more variance. Typically, when tuning this parameter we stay near the 60-80% range
# (see the sketch right below for a grid that also includes these extra hyperparameters)
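# A minimal sketch of how the grid used below could be extended to also tune "sampsize" and "maxnodes".
# The specific values here are purely illustrative, and the same grid-search loop shown below would work
# after adding the extra entries to the "params" list
hyper_grid_ext <- expand.grid(
  nodesize = c(1, 5),
  ntree = c(100, 200),
  mtry = c(28, 30),
  sampsize = floor(c(0.632, 0.8) * nrow(train)),  # roughly 63% and 80% of the training observations
  maxnodes = c(50, 100),
  min_error = 0,
  accuracy = 0
)
nrow(hyper_grid_ext)  # 2*2*2*2*2 = 32 combinations: the grid grows quickly!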
# create hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two:
# ntree (100 and 200) and mtry (28 and 30), with nodesize fixed at 1
# we will use the "tune" command to run a grid-search. By default, tune implements a 10-fold CV, but you can control that
# via the command tune.control. In our case we set the folds to 5 to make things faster

hyper_grid <- expand.grid(
  nodesize = c(1),
  ntree = c(100, 200),
  mtry = c(28, 30),
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)  # 4 possibilities by crossing ntree with mtry
hyper_grid
# if you want to add several values you can write something like:
# ntree = seq(100, 300, by = 10)

# to tune the RF, let's employ the function "tune"
# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    nodesize = hyper_grid$nodesize[i],
    ntree = hyper_grid$ntree[i],
    mtry = hyper_grid$mtry[i]
  )
  set.seed(123)
  # train model
  rf.tune <- tune(randomForest, train.y = as.factor(Dfm_train@docvars$Sentiment), train.x = train,
                  ranges = params, do.trace = TRUE, tunecontrol = tune.control(cross = 5))
  # add min training error and accuracy to grid
  hyper_grid$min_error[i] <- min(rf.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

# number of folds for CV
rf.tune$sampling
str(hyper_grid)
# let's see the results
head(arrange(hyper_grid, min_error), 10)

######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the SVM
######################################################
######################################################

# Let's stick to the linear kernel.
# Beyond C ("cost"), another main hyperparameter for a linear kernel is "epsilon", the parameter controlling the width of the
# epsilon-insensitive zone of the insensitive-loss function (default: 0.1). Note that epsilon really matters for SVM regression;
# for a classification task such as ours it should make little difference, which is consistent with the results below.
# Accordingly, I can investigate different combinations of values for C (default: C=1) as well as for epsilon

# create hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two (cost and epsilon)
hyper_grid <- expand.grid(
  cost = c(0.1, 1),
  epsilon = c(0.1, 3),
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)  # 4 possibilities by crossing cost with epsilon
hyper_grid

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    cost = hyper_grid$cost[i],
    epsilon = hyper_grid$epsilon[i]
  )
  set.seed(123)
  # train model
  sv.tune <- tune(svm, train.y = as.factor(Dfm_train@docvars$Sentiment), train.x = train, kernel = "linear",
                  ranges = params, tunecontrol = tune.control(cross = 5))
  hyper_grid$min_error[i] <- min(sv.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

# number of folds for CV
sv.tune$sampling
# results
head(arrange(hyper_grid, min_error), 10)
# in this specific case, there is not a lot of change.
# Of course, changing the range of values explored for cost and epsilon (for example, by also looking at values of cost > 10)
# can change the final results (see the sketch right below)
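# A minimal sketch of a broader grid for the linear kernel, spanning several orders of magnitude for cost;
# the values are purely illustrative, and the same grid-search loop used above would work unchanged
hyper_grid_wide <- expand.grid(
  cost = c(0.01, 0.1, 1, 10, 100),
  epsilon = c(0.1),
  min_error = 0,
  accuracy = 0
)
nrow(hyper_grid_wide)  # 5 combinations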
# for all the other kernels (radial, polynomial), you also have "gamma", a scaling parameter used to fit nonlinear boundaries
# (default: 1/(data dimension), i.e. in our case 1/ncol(train))
# for the polynomial kernel you also have "degree" (default: 3) and "coef0" (default: 0)

# Let's explore a radial kernel
# to make things faster in the Lab, we let only gamma change, while fixing the cost and epsilon values to 1 and 0.3 respectively
hyper_grid <- expand.grid(
  cost = c(1),
  epsilon = c(0.3),
  gamma = c(0.001, 0.01, 0.1, 1),
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)  # 4 possibilities
hyper_grid

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    cost = hyper_grid$cost[i],
    epsilon = hyper_grid$epsilon[i],
    gamma = hyper_grid$gamma[i]
  )
  set.seed(123)
  # train model
  sv.tune <- tune(svm, train.y = as.factor(Dfm_train@docvars$Sentiment), train.x = train, kernel = "radial",
                  ranges = params, tunecontrol = tune.control(cross = 5))
  # add min training error and accuracy to grid
  hyper_grid$min_error[i] <- min(sv.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

head(arrange(hyper_grid, min_error), 10)
# the best model here has gamma=0.001 when cost=1 and epsilon=0.3; note however that the error of the best model with a radial
# kernel is worse than the one you get via the linear kernel
# (see the sketch right below for a fuller radial grid that lets cost vary as well)
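# A minimal sketch of a fuller radial-kernel grid that crosses gamma with cost instead of keeping cost fixed;
# the values are purely illustrative, and the same grid-search loop used above would work unchanged
hyper_grid_radial <- expand.grid(
  cost = c(0.1, 1, 10),
  epsilon = c(0.3),
  gamma = c(0.001, 0.01, 0.1, 1),
  min_error = 0,
  accuracy = 0
)
nrow(hyper_grid_radial)  # 3*4 = 12 combinations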
######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the XGB
######################################################
######################################################

# As we discussed, there are several hyperparameters involved in an XGB. Among the most important ones:
# "eta" (learning rate): controls how quickly the algorithm proceeds down the gradient descent.
# Smaller values reduce the chance of overfitting but also increase the time needed to find the optimal fit. Default=0.3
# "nrounds" (number of trees): GBMs often require many trees; however, unlike random forests, GBMs can overfit,
# so the goal is to find the optimal number of trees that minimizes the loss function of interest with cross-validation.
# A good number of trees to begin with: 1000
# "max_depth" (maximum depth of a tree): the number of splits in each tree, which controls the complexity of the boosted
# ensemble. Often a max_depth=1 works well, in which case each tree is a stump consisting of a single split.
# More commonly, max_depth is greater than 1, but it is unlikely that max_depth > 10 will be required. Default=6
# "subsample": it controls whether or not you use a fraction of the available training observations. Using less than 100% of the
# training observations means you are implementing a stochastic gradient descent! For example, setting it to 0.5 means that
# xgboost randomly collects half of the data instances to grow trees. This prevents overfitting, keeps the algorithm from getting
# stuck in a local minimum or plateau of the loss function gradient, and makes computation shorter (less data to analyse). Default=1
# Other possible hyperparameters to be considered:
# "colsample_bytree": subsample ratio of columns when constructing each tree. Default=1
# "min_child_weight": minimum sum of instance weight needed in a child. If the tree partition step results in a leaf node
# with a sum of instance weight less than min_child_weight, then the building process will give up further partitioning.
# The larger, the more conservative the algorithm will be. Default=1
# NOTE: we use "nthread" (number of threads) because it enables parallelization on your computer. Faster!

# your DV should always be a numeric variable starting from 0. If that is not the case (as here), you need to create such a variable
x <- as.factor(Dfm_train@docvars$Sentiment)
x
table(x)
x <- as.numeric(x)
x
x[x == 1] <- 0
x[x == 2] <- 1
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
train <- as.matrix(Dfm_train)  # dense matrix

# create hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two:
# eta (1 and 2) and max_depth (1 and 2)
hyper_grid <- expand.grid(
  eta = c(1, 2),
  max_depth = c(1, 2),
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)
hyper_grid

# We will employ in this case the xgb.cv command with, once again, a number of folds for CV = 5.
# A further nice feature of the xgb.cv command is the early_stopping_rounds option. This allows us to tell the function to stop
# running if the cross-validated error does not improve for n consecutive trees.
# For example, the model below will stop if we see no improvement for 200 consecutive trees.
# This feature will help us speed up the tuning process in the next section.

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )
  set.seed(123)
  # train model
  xgb.tune <- xgb.cv(
    params = params,                # here you write params = params, NOT ranges = params
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 5,                      # number of folds for CV
    objective = "binary:logistic",  # for binary outcomes
    verbose = 1,                    # not silent
    nthread = 4,
    metrics = "error",              # binary classification error rate
    early_stopping_rounds = 200     # stop if no improvement for 200 consecutive trees
  )
  # add min training error to grid
  hyper_grid$min_error[i] <- min(xgb.tune$evaluation_log$test_error_mean)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

# number of folds for the CV
length(xgb.tune$folds)
# results
head(arrange(hyper_grid, min_error), 10)

### let's add another hyperparameter such as subsample
hyper_grid <- expand.grid(
  eta = c(1, 2),
  max_depth = c(1, 2),
  subsample = c(.5, 1),  # that's new!
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)
hyper_grid

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i],
    subsample = hyper_grid$subsample[i]  # that's new!
  )
  # reproducibility
  set.seed(123)
  # train model
  xgb.tune2 <- xgb.cv(
    params = params,
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 5,
    objective = "binary:logistic",  # for binary outcomes
    verbose = 1,                    # not silent
    nthread = 4,
    metrics = "error",              # binary classification error rate
    early_stopping_rounds = 50      # stop if no improvement for 50 consecutive trees
  )
  # add min training error to grid
  hyper_grid$min_error[i] <- min(xgb.tune2$evaluation_log$test_error_mean)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

head(arrange(hyper_grid, min_error), 10)
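# A minimal sketch of how the grid could be extended further with the other hyperparameters mentioned above
# (colsample_bytree and min_child_weight); the values are purely illustrative, and the same xgb.cv loop used above
# would work after adding the two extra entries to the "params" list
hyper_grid_xgb <- expand.grid(
  eta = c(1, 2),
  max_depth = c(1, 2),
  subsample = c(.5, 1),
  colsample_bytree = c(.5, 1),  # subsample ratio of columns when constructing each tree
  min_child_weight = c(1, 3),   # larger values make the algorithm more conservative
  min_error = 0,
  accuracy = 0
)
nrow(hyper_grid_xgb)  # 32 combinations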
######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the NB!
######################################################
######################################################

# The main hyperparameter is the value of Laplace smoothing.
# In the NB case you cannot use the "tune" function, so we will make do with a different script. A bit more complicated, but
# once written, you can always reuse it!
# Let's see an example of changing the value of Laplace from 0.5 to 2.5 by 0.5

# STEP 1: create the folds
ttrain <- train  # let's rename the original training matrix, given that we are going to reuse the name "train" in the loop below
# let's split our training-set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5  # the number of folds
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

# STEP 2: run the nested loop (folds x Laplace values)
for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]       # set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ]  # set the validation set
  for (j in seq(0.5, 2.5, by = 0.5)){  # here you can change the values as you want
    set.seed(123)
    # fit the model on the training folds only, and remember to pass the name of the outcome (in this case "Sentiment")
    newnb <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                                     x = train, laplace = j)
    # get the predictions for the validation set (from the model just fit on the training data)
    newpred <- predict(newnb, newdata = validation)
    class_table <- table("Predictions" = newpred, "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
    print(class_table)
    df <- confusionMatrix(class_table, mode = "everything")
    # create the name of the object that stores the confusion matrix for this fold/Laplace combination
    df.name <- paste0("conf.mat.nb", i, "_", j)
    assign(df.name, df)
  }
}

NBPredict <- data.frame(col1 = vector())
for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  NBPredict <- rbind(NBPredict, cbind(Accuracy))
}
str(NBPredict)
nrow(NBPredict)/k  # number of estimated values for Laplace
values <- nrow(NBPredict)/k
values

# the results in NBPredict are saved like this: first all the results of the first fold for all the values of Laplace, then all
# the results of the second fold, and so on. For example, imagine that you have k=5 folds and Laplace takes just 2 values, 0.5
# and 1. Then the first two Accuracy results in NBPredict are the ones you get in fold 1 for Laplace 0.5 and then 1; the third
# and fourth are the ones you get in fold 2 for Laplace 0.5 and then 1; and so on up to fold 5
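# A possible simplification (just a sketch, not used below): if inside the loop above you also stored the fold and the Laplace
# value next to each Accuracy, e.g.
# NBPredict <- rbind(NBPredict, data.frame(fold = i, laplace = j, Accuracy = df$overall[1]))
# you could then aggregate directly with dplyr instead of reconstructing the ordering by hand:
# NBPredict %>% group_by(laplace) %>% summarise(Avg_Accuracy = mean(Accuracy)) %>% arrange(desc(Avg_Accuracy))
# The script below sticks to the index-based approach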
for (i in 1:values) {
  # generate the list of row numbers that correspond to all the k-fold results for each single value of Laplace
  id <- seq(i, nrow(NBPredict), values)
  name <- paste0("index", i)
  assign(name, id)
}

for(i in mget(ls(pattern = "index"))) {
  # extract the k-fold results for each value of Laplace
  id <- NBPredict[i, ]
  name <- paste0("laplace", i[1])  # i is the vector of row positions; its first element identifies the Laplace value
  assign(name, id)
}

# Let's compare the average value of accuracy
# generate an empty database that you will fill with the average accuracy for each value of Laplace
NBresults <- data.frame(col1 = vector())
for (i in 1:values){
  database <- get(paste0("laplace", i))
  Avg_Accuracy <- mean(database)
  NBresults <- rbind(NBresults, cbind(Avg_Accuracy))
}
NBresults$LaplaceValue <- as.factor(seq(0.5, 2.5, by = 0.5))  # remember to write here the range of Laplace values you are exploring!
head(NBresults[order(-NBresults$Avg_Accuracy), ])  # sorting by accuracy

# You see that there is indeed some change, but usually a grid search does not improve things dramatically with respect
# to the values you get when running an ML algorithm with its default hyperparameter values

######################################################
######################################################
# Once you have estimated the best hyperparameter settings for each ML algorithm, you could replicate the
# K-fold analysis to see which is the most advisable ML algorithm (given your training-set)
######################################################
######################################################
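# A minimal sketch of that follow-up comparison, reusing the 5 folds created for the NB tuning. The hyperparameter values
# below (ntree/mtry for the RF, cost for the linear SVM, eta/max_depth/subsample for the XGB, laplace for the NB) are purely
# illustrative placeholders: plug in the best values you found in the grid searches above
acc_rf <- acc_sv <- acc_xgb <- acc_nb <- rep(NA, k)
for(i in 1:k){
  train_cv <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  y_train <- as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment)
  y_val <- Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment
  code_train <- Dfm_train[folds$subsets[folds$which != i], ]@docvars$code
  code_val <- Dfm_train[folds$subsets[folds$which == i], ]@docvars$code
  set.seed(123)
  # RF with the tuned hyperparameters
  rf_fit <- randomForest(x = train_cv, y = y_train, ntree = 100, mtry = 28)
  acc_rf[i] <- mean(predict(rf_fit, newdata = validation) == y_val)
  # linear SVM with the tuned cost
  sv_fit <- svm(x = train_cv, y = y_train, kernel = "linear", cost = 1)
  acc_sv[i] <- mean(predict(sv_fit, newdata = validation) == y_val)
  # XGB with the tuned eta, max_depth and subsample
  xgb_fit <- xgboost(data = train_cv, label = code_train, nrounds = 500, eta = 1, max_depth = 1, subsample = 0.5,
                     objective = "binary:logistic", verbose = 0, nthread = 4)
  acc_xgb[i] <- mean(ifelse(predict(xgb_fit, newdata = validation) > 0.5, 1, 0) == code_val)
  # NB with the tuned Laplace value
  nb_fit <- multinomial_naive_bayes(x = train_cv, y = y_train, laplace = 0.5)
  acc_nb[i] <- mean(predict(nb_fit, newdata = validation) == y_val)
}
# average accuracy across the 5 folds for each algorithm
data.frame(RF = mean(acc_rf), SVM = mean(acc_sv), XGB = mean(acc_xgb), NB = mean(acc_nb))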