rm(list=ls(all=TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(caret)
library(car)
library(xgboost)
library(reshape2)
library(gridExtra)
library(cvTools)
library(Ckmeans.1d.dp)
library(dplyr)  # needed for arrange() in the grid-search results below

#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")), remove_punct = TRUE,
                 remove_numbers = TRUE, tolower = TRUE, remove_symbols = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)

# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words
train <- as.matrix(Dfm_train)  # dense matrix

######################################################
######################################################
# Let's run CROSS-VALIDATION with k = 5 folds, with a loop!
# HERE you have JUST 2 classes for the outcome (negative/positive)
######################################################
######################################################

# Some R packages, such as caret, allow you to run a k-fold cross-validation with a few lines of code.
# But I want you to understand what is going on, so it is better to work with this loop!
# Then, if you wish, move to caret (a hedged caret sketch follows right after the fold creation below).

# STEP 1: create the 5 folds
ttrain <- train  # rename the original training matrix, since the name "train" is reused inside the loop below
# let's split our training-set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5  # the number of folds; whatever number you choose here, the procedure below will always work!
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)
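# A minimal caret sketch of the same 5-fold cross-validation, as mentioned above.
# This is NOT part of the original workflow, just an illustration of the "few lines of code"
# alternative; it assumes caret's "xgbTree" method (and its dependencies) is installed,
# and it runs caret's own default tuning grid, so it can be slow.
# ctrl <- caret::trainControl(method = "cv", number = 5)
# set.seed(123)
# caret_xgb <- caret::train(x = train, y = as.factor(Dfm_train@docvars$Sentiment),
#                           method = "xgbTree", trControl = ctrl)
# caret_xgb$results  # cross-validated accuracy for each hyperparameter combination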
######################################################
######################################################
# Gradient Boosting LOOP
######################################################
######################################################

# Your DV should always be a numeric variable starting from 0.
# If that is not the case (as here), you need to create such a variable
x <- as.factor(Dfm_train@docvars$Sentiment)
x
table(x)
x <- as.numeric(x)
x
x[x == 1] <- 0
x[x == 2] <- 1
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
table(Dfm_train@docvars$Sentiment)

# STEP 2: the LOOP
system.time(for (i in 1:k) {
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- xgboost(
    data = train,
    label = Dfm_train[folds$subsets[folds$which != i], ]@docvars$code,  # here "code", NOT "Sentiment"!!!
    nrounds = 500,
    eta = 1,
    nthread = 4,
    objective = "binary:logistic",  # for binary outcomes
    eval_metric = "error",          # binary classification error rate
    verbose = 0)
  newpred <- predict(newrf, newdata = validation)  # predicted probabilities
  newpred <- round(newpred)                        # round them to get the predicted class (0/1)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$code)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.xgb", i)
  assign(df.name, df)
})

# STEP 3: the metrics
XGBPredict <- data.frame(col1 = vector(), col2 = vector(), col3 = vector())
for (i in mget(ls(pattern = "conf.mat.xgb"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  p <- as.data.frame((i)$byClass)
  F1_negative <- (2 * (i)$byClass[1] * (i)$byClass[3]) / ((i)$byClass[1] + (i)$byClass[3])  # F1 for the negative class
  F1_positive <- (2 * (i)$byClass[2] * (i)$byClass[4]) / ((i)$byClass[2] + (i)$byClass[4])  # F1 for the positive class
  XGBPredict <- rbind(XGBPredict, cbind(Accuracy, F1_negative, F1_positive))
}
XGBPredict[is.na(XGBPredict)] <- 0
str(XGBPredict)

# Let's compare the average values for accuracy and F1
acc_xgb_avg <- mean(XGBPredict[, 1])
f1_xgb_avg <- mean(colMeans(XGBPredict[-1]))  # average of the two F1 columns
acc_xgb_avg
f1_xgb_avg

######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters of the XGB
######################################################
######################################################

# As we discussed, there are several hyperparameters involved in an XGB. Among the most important ones:
# "eta" (learning rate): controls how quickly the algorithm proceeds down the gradient descent.
#   Smaller values reduce the chance of overfitting but also increase the time needed to find the optimal fit. Default = 0.3
# "nrounds" (number of trees): GBMs often require many trees; however, unlike random forests, GBMs can overfit,
#   so the goal is to find, via cross-validation, the optimal number of trees that minimizes the loss function of interest.
#   A good number of trees to begin with: 1000
# "max_depth" (maximum depth of a tree): the number of splits in each tree, which controls the complexity of the boosted
#   ensemble. Often max_depth = 1 works well, in which case each tree is a stump consisting of a single split.
#   More commonly, max_depth is greater than 1, but it is unlikely that max_depth > 10 will be required. Default = 6
# "subsample": controls whether you use only a fraction of the available training observations. Using less than 100% of the
#   training observations means you are implementing a stochastic gradient descent! For example, setting it to 0.5 means that
#   xgboost randomly collects half of the data instances to grow each tree. This makes computation shorter (less data to analyse)
#   and helps prevent overfitting and getting stuck in a local minimum or plateau of the loss function gradient. Default = 1

# Other possible hyperparameters to be considered:
# "colsample_bytree": subsample ratio of columns when constructing each tree. Default = 1
# "min_child_weight": minimum sum of instance weight needed in a child. If the tree partition step results in a leaf node
#   with a sum of instance weight less than min_child_weight, the building process gives up further partitioning.
#   The larger, the more conservative the algorithm will be. Default = 1

# NOTE: we use "nthread" (number of threads) because this enables parallelization on your computer. Faster!
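# A quick illustrative sketch (not part of the original tuning below) of the two extra
# hyperparameters just mentioned, colsample_bytree and min_child_weight, in a single
# xgb.cv run. The specific values (0.8 and 5) are arbitrary choices for illustration only;
# ttrain is the full (untouched) training matrix from above.
set.seed(123)
xgb.sketch <- xgb.cv(
  params = list(eta = 0.3,               # the default learning rate
                max_depth = 6,           # the default tree depth
                colsample_bytree = 0.8,  # each tree sees a random 80% of the features
                min_child_weight = 5),   # require a larger minimum leaf weight (more conservative)
  data = ttrain,
  label = Dfm_train@docvars$code,
  nrounds = 200, nfold = 5, nthread = 4,
  objective = "binary:logistic",
  metrics = "error",
  early_stopping_rounds = 50,
  verbose = 0)
min(xgb.sketch$evaluation_log$test_error_mean)  # best cross-validated error with these settings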
# Your DV should always be a numeric variable starting from 0.
# If that is not the case (as here), you need to create such a variable
x <- as.factor(Dfm_train@docvars$Sentiment)
x
table(x)
x <- as.numeric(x)
x
x[x == 1] <- 0
x[x == 2] <- 1
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
train <- as.matrix(Dfm_train)  # dense matrix

# create the hyperparameter grid: you can add as many values and hyperparameters as you want.
# Here just two: eta (1 and 2) and max_depth (1 and 2).
# NOTE: eta is normally kept in (0, 1]; values above 1 are used here purely for illustration.
hyper_grid <- expand.grid(
  eta = c(1, 2),
  max_depth = c(1, 2),
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)
hyper_grid

# A further nice feature is the early_stopping_rounds option. It tells the function to stop running if the
# cross-validated error does not improve for n consecutive trees.
# For example, the model below will stop if we see no improvement for 200 consecutive trees.
# This feature will help us speed up the tuning process.

# grid search
for (i in 1:nrow(hyper_grid)) {
  # create the parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )
  set.seed(123)
  # train the model
  xgb.tune <- xgb.cv(
    params = params,  # here you write params = params, NOT ranges = params
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 10,
    objective = "binary:logistic",  # for binary outcomes
    verbose = 1,                    # not silent
    nthread = 4,
    metrics = "error",              # binary classification error rate
    early_stopping_rounds = 200     # stop if no improvement for 200 consecutive trees
  )
  # add the minimum cross-validated error to the grid
  hyper_grid$min_error[i] <- min(xgb.tune$evaluation_log$test_error_mean)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

head(arrange(hyper_grid, min_error), 10)

### let's add another hyperparameter, such as subsample
hyper_grid <- expand.grid(
  eta = c(1, 2),
  max_depth = c(1, 2),
  subsample = c(.5, 1),  # that's new!
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)
hyper_grid

# grid search
for (i in 1:nrow(hyper_grid)) {
  # create the parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i],
    subsample = hyper_grid$subsample[i]  # that's new!
  )
  # reproducibility
  set.seed(123)
  # train the model
  xgb.tune2 <- xgb.cv(
    params = params,
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 10,
    objective = "binary:logistic",  # for binary outcomes
    verbose = 1,                    # not silent
    nthread = 4,
    metrics = "error",              # binary classification error rate
    early_stopping_rounds = 50      # stop if no improvement for 50 consecutive trees
  )
  # add the minimum cross-validated error to the grid
  hyper_grid$min_error[i] <- min(xgb.tune2$evaluation_log$test_error_mean)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

head(arrange(hyper_grid, min_error), 10)
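# A hedged follow-up sketch (not in the original script): once the grid search has picked a winner,
# refit a single model on the full training set with the best-performing combination.
# The object names (best, final_xgb) are illustrative.
best <- hyper_grid[which.min(hyper_grid$min_error), ]
best
set.seed(123)
final_xgb <- xgboost(
  data = train,
  label = Dfm_train@docvars$code,
  nrounds = 500,
  eta = best$eta,
  max_depth = best$max_depth,
  subsample = best$subsample,
  objective = "binary:logistic",
  eval_metric = "error",
  nthread = 4,
  verbose = 0)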
#####################################################
# let's prepare the training-set with 3 categories
# (this script works fine for any number of categories > 2)
#####################################################

######################################################
######################################################
# what are the main changes when running an XGB?
######################################################
######################################################

uk_train <- read.csv("uk_train.csv")
str(uk_train)
myCorpusTwitterTrain <- corpus(uk_train)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english"), "amp"), remove_punct = TRUE,
                 remove_numbers = TRUE, tolower = TRUE, remove_symbols = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)

# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
topfeatures(Dfm_train, 20)  # 20 top words
train <- as.matrix(Dfm_train)

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: accuracy .530
prop.table(table(Dfm_train@docvars$Sentiment))

# REMEMBER: you always need to add the number of classes to the call when the DV is a
# multi-categorical variable, as it is now!
numberOfClasses <- length(unique(Dfm_train@docvars$Sentiment))
numberOfClasses

# Your DV should always be a numeric variable starting from 0.
# If that is not the case (as here), you need to create such a variable
x <- as.factor(Dfm_train@docvars$Sentiment)
x
table(x)
x <- as.numeric(x)
table(x)
x[x == 1] <- 0
x[x == 2] <- 1
x[x == 3] <- 2  # CHANGE!
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
table(Dfm_train@docvars$Sentiment)

# create the hyperparameter grid: you can add as many values and hyperparameters as you want.
# Here just two: eta (1 and 2) and max_depth (1 and 2)
hyper_grid <- expand.grid(
  eta = c(1, 2),
  max_depth = c(1, 2),
  optimal_trees = 0,  # a place to dump results
  min_error = 0,      # a place to dump results
  accuracy = 0        # a place to dump results
)
nrow(hyper_grid)  # 4 combinations from crossing eta with max_depth
hyper_grid

# grid search
for (i in 1:nrow(hyper_grid)) {
  # create the parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )
  set.seed(123)
  # train the model
  xgb.tune <- xgb.cv(
    params = params,                # here you write params = params, NOT ranges = params
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 10,
    objective = "multi:softmax",    # for multi-category outcomes   # CHANGE!
    num_class = numberOfClasses,    # add this line!                # CHANGE!
    verbose = 1,                    # not silent
    metrics = "merror",             # multiclass classification error rate  # CHANGE!
    nthread = 4,
    early_stopping_rounds = 150     # stop if no improvement for 150 consecutive trees
  )
  # add the minimum cross-validated error and the optimal number of trees to the grid
  hyper_grid$optimal_trees[i] <- which.min(xgb.tune$evaluation_log$test_merror_mean)  # CHANGE: "test_merror_mean", not "test_error_mean"
  hyper_grid$min_error[i] <- min(xgb.tune$evaluation_log$test_merror_mean)            # CHANGE: "test_merror_mean", not "test_error_mean"
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

head(arrange(hyper_grid, min_error), 10)

### Also remember: if you use objective = "multi:softmax" you can predict classes directly, with no need to round
### probabilities, because "multi:softmax" returns the predicted class (not probabilities), contrary to "binary:logistic"
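# A short hedged sketch of the point above (illustrative only; the model and its hyperparameter
# values are arbitrary, and we predict on the training matrix itself just to inspect the output):
set.seed(123)
multi_xgb <- xgboost(
  data = train,
  label = Dfm_train@docvars$code,
  nrounds = 100,
  eta = 1,
  max_depth = 1,
  objective = "multi:softmax",
  num_class = numberOfClasses,
  eval_metric = "merror",
  nthread = 4,
  verbose = 0)
head(predict(multi_xgb, train))  # integer class codes (0/1/2), no rounding needed
# with objective = "binary:logistic", by contrast, predict() returns probabilities that must be rounded to 0/1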