rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)
library(xgboost)
library(Ckmeans.1d.dp)

#####################################################
# let's prepare the training-set with 3 categories (this script works fine for any number of categories > 2)
#####################################################

# This is a training-set of tweets discussing the 2014 European Parliament elections in the UK
uk_train <- read.csv("uk_train.csv")
str(uk_train)

myCorpusTwitterTrain <- corpus(uk_train)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english"), ("amp")), remove_punct = TRUE,
                 remove_numbers = TRUE, tolower = TRUE, remove_symbols = TRUE, remove_separators = TRUE,
                 remove_url = TRUE, split_hyphens = TRUE)

# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
topfeatures(Dfm_train, 20) # 20 top words
train <- as.matrix(Dfm_train)

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: accuracy .53
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Which main changes compared to the script "Lab 4-A 2020 LUMACSS"?
# Consider the case of an SVM - but this applies to all the other scripts
######################################################
######################################################

# STEP 1: create the 5 folds
ttrain <- train
set.seed(123) # set the seed for replicability
k <- 5
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

# STEP 2: the LOOP
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]      # Set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ] # Set the validation set
  set.seed(123)
  # Get your new model (fit on the train data only) and ADD the name of the output (in this case "Sentiment")
  newrf <- svm(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
               x = train, kernel = 'linear', cost = 1)
  # Get the predictions for the validation set (from the model just fit on the train data)
  newpred <- predict(newrf, newdata = validation)
  class_table <- table("Predictions" = newpred,
                       "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.sv", i) # create the name for the object that will save the confusion matrix for each fold (=5)
  assign(df.name, df)
})

# STEP 3: the metrics
SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector())
##### FIRST CHANGE
# Why 4 columns now? 1 for accuracy, and 3 for the F1 value of each class of Sentiment: negative, neutral, positive.
# Adjust the number of columns to fill according to the number of classes in your output variable!
ls() # we have created 5 objects that saved the 5 confusion matrices. I can now estimate the performance metrics on such results
# for example:
conf.mat.sv1
str(conf.mat.sv1)
conf.mat.sv1$overall[1] # overall accuracy
# now if you save conf.mat.sv1$byClass as a data frame and check its structure, you can see that the three F1 values
# (one per class) are already there!
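# (A minimal sanity-check sketch, not part of the original lab: the per-class F1 reported by
# confusionMatrix() is the harmonic mean of that class's precision and recall, F1 = 2*P*R/(P+R).
# "ex_check" is just an illustrative object name.)
ex_check <- as.data.frame(conf.mat.sv1$byClass)
all.equal(ex_check$F1,
          2 * ex_check$Precision * ex_check$Recall / (ex_check$Precision + ex_check$Recall))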
ex <- as.data.frame(conf.mat.sv1$byClass)
str(ex)

for(i in mget(ls(pattern = "conf.mat.sv"))) {
  Accuracy <- (i)$overall[1] # save in the matrix the accuracy value
  ##### SECOND CHANGE: the following 4 lines
  p <- as.data.frame((i)$byClass)
  F1_negative <- p$F1[1] # save in the matrix the F1 value for negative
  F1_neutral <- p$F1[2]  # save in the matrix the F1 value for neutral
  F1_positive <- p$F1[3] # save in the matrix the F1 value for positive
  SVMPredict <- rbind(SVMPredict, cbind(Accuracy, F1_negative, F1_neutral, F1_positive))
}

# if you get an NA for the F1 of some category (this happens when BOTH precision and recall for that category are 0), replace the NA with 0
SVMPredict[is.na(SVMPredict)] <- 0
str(SVMPredict) # you see that we are not doing that well with the class "negative"

# Let's compare the average value for accuracy and F1
acc_sv_avg <- mean(SVMPredict[, 1])
f1_sv_avg <- mean(colMeans(SVMPredict[-1]))
acc_sv_avg
# our benchmark: accuracy .53
prop.table(table(Dfm_train@docvars$Sentiment))
f1_sv_avg

# You see here that we do not improve that much compared to our benchmark model.
# Moreover, there is a wide gap between accuracy and the avg. value of F1. Why? Because we are doing reasonably well with one class (neutral)
# and badly with the others. The presence of an imbalanced dataset could be a reason for that. So here it does not matter
# which fancy ML algorithm (and connected mix of hyper-parameters) you are employing.
# The only way to improve the performance of an ML algorithm in the CV stage (before predicting the test-set) is to go back
# to the training-set and improve it (for example by adding more texts displaying the classes on which you are doing badly).
# Any ML algorithm trained on a poorly built training-set is always going to be a poor ML algorithm in terms of performance.

######################################################
######################################################
# Which main changes compared to the script "Lab 4-A 2020 LUMACSS"? The XGB case
######################################################
######################################################

uk_train <- read.csv("uk_train.csv")
myCorpusTwitterTrain <- corpus(uk_train)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english"), ("amp")), remove_punct = TRUE,
                 remove_numbers = TRUE, tolower = TRUE, remove_symbols = TRUE, remove_separators = TRUE,
                 remove_url = TRUE, split_hyphens = TRUE)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)
train <- as.matrix(Dfm_train)

# REMEMBER: with a multi-categorical outcome like this one, you always need to pass the number of classes to be classified to xgboost!
numberOfClasses <- length(unique(Dfm_train@docvars$Sentiment))
numberOfClasses

# Your DV should always be a numeric variable starting from 0. If that is not the case (as here), you need to create such a variable
str(Dfm_train@docvars$Sentiment)
x <- as.factor(Dfm_train@docvars$Sentiment)
x <- as.numeric(x)
table(x)
table(Dfm_train@docvars$Sentiment)
x[x == 1] <- 0
x[x == 2] <- 1
x[x == 3] <- 2 # CHANGE!
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
table(Dfm_train@docvars$Sentiment)
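# (A compact, equivalent recoding, shown only as a sketch: factor levels are stored as integers
# starting at 1, so subtracting 1 directly yields the 0-based codes that xgboost expects for
# multi-class objectives. "code_check" is just an illustrative name.)
code_check <- as.numeric(as.factor(Dfm_train@docvars$Sentiment)) - 1
table(code_check, Dfm_train@docvars$Sentiment)   # same mapping as the manual recoding above
all(code_check == Dfm_train@docvars$code)        # should be TRUE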
# create hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two:
# eta (1 and 2) and max_depth (1 and 2)
hyper_grid <- expand.grid(
  eta = c(1, 2),
  max_depth = c(1, 2),
  min_error = 0, # a place to dump results
  accuracy = 0   # a place to dump results
)
nrow(hyper_grid) # 4 possibilities by crossing eta with max_depth
hyper_grid

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    eta = hyper_grid$eta[i],
    max_depth = hyper_grid$max_depth[i]
  )
  set.seed(123)
  # train model
  xgb.tune <- xgb.cv(
    params = params, # here you write params = params NOT ranges = params
    data = train,
    label = Dfm_train@docvars$code,
    nrounds = 500,
    nfold = 5,
    objective = "multi:softmax", # for multi-category # CHANGE!
    num_class = numberOfClasses, # add this line! # CHANGE!
    verbose = 1, # not silent
    metrics = "merror", # exact matching error, used to evaluate multi-class classification # CHANGE!
    nthread = 4,
    early_stopping_rounds = 150 # stop if no improvement for 150 consecutive trees
  )
  # add the minimum CV error to the grid
  hyper_grid$min_error[i] <- min(xgb.tune$evaluation_log$test_merror_mean) # CHANGE: "test_merror_mean" and not "test_error_mean"
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

head(arrange(hyper_grid, min_error), 10)

### Also remember: if you use objective = "multi:softmax" you can directly predict classes, without any need to round probabilities,
### given that objective = "multi:softmax" returns the predicted class (not probabilities), contrary to objective = "binary:logistic"

######################################################
######################################################
# Which main changes compared to the script "Lab 4-B 2020 LUMACSS"?
# The second change is just related to the NB loop (but you should already know about it!)
######################################################
######################################################

# STEP 1: create the folds
ttrain <- train # let's rename the original train matrix, given that we are going to reuse that name below in the loop
# let's split our training-set into 5 folds
set.seed(123) # set the seed for replicability
k <- 5 # the number of folds
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

# STEP 2: the LOOP (with a grid search over the Laplace smoothing value)
for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]      # Set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ] # Set the validation set
  for (j in seq(0.1, 2.5, by = 0.1)){ # here you can change the values as you want
    set.seed(123)
    # fit the model on the train data only and ADD the name of the output (in this case "Sentiment")
    newrf <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                                     x = train, laplace = j)
    # Get the predictions for the validation set (from the model just fit on the train data)
    newpred <- predict(newrf, newdata = validation)
    class_table <- table("Predictions" = newpred,
                         "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
    print(class_table)
    df <- confusionMatrix(class_table, mode = "everything")
    df.name <- paste0("conf.mat.nb", i, "/", j) # create the name for the object that will save the confusion matrix for each fold/Laplace combination
    assign(df.name, df)
  }
}
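# (A quick sanity check, not part of the original lab: with k = 5 folds and the 25 Laplace values
# in seq(0.1, 2.5, by = 0.1), the nested loop above creates 5 * 25 = 125 confusion-matrix objects.)
length(seq(0.1, 2.5, by = 0.1)) * k   # expected number of objects
length(ls(pattern = "conf.mat.nb"))   # should return the same number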
# STEP 3: the metrics
NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector())
##### FIRST CHANGE
# Why 4 columns now? 1 for accuracy, and 3 for the F1 value of each class of Sentiment: negative, neutral, positive.
# Adjust the number of columns to fill according to the number of classes in your output variable!

for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1] # save in the matrix the accuracy value
  ##### SECOND CHANGE: the following 4 lines
  p <- as.data.frame((i)$byClass)
  F1_negative <- p$F1[1] # save in the matrix the F1 value for negative
  F1_neutral <- p$F1[2]  # save in the matrix the F1 value for neutral
  F1_positive <- p$F1[3] # save in the matrix the F1 value for positive
  NBPredict <- rbind(NBPredict, cbind(Accuracy, F1_negative, F1_neutral, F1_positive))
}

# For the comparison across Laplace values below, we only need accuracy, so let's re-fill NBPredict with a single column
NBPredict <- data.frame(col1=vector())
for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1] # save in the matrix the accuracy value
  NBPredict <- rbind(NBPredict, cbind(Accuracy))
}
str(NBPredict)

nrow(NBPredict)/k # number of Laplace values explored
values <- nrow(NBPredict)/k
values

# The results in NBPredict are saved like this: first all the results of the first fold for all the values of Laplace,
# then all the results of the second fold, and so on.
# For example, imagine that k = 5 and Laplace takes just 2 values: 0.5 and 1. Then the first two Accuracy results in NBPredict
# are the results you get in fold 1 for Laplace 0.5 and then 1; the third and fourth Accuracy results are
# the results you get in fold 2 for Laplace 0.5 and then 1; and so on up to fold 5

# generate the list of row numbers that correspond to the k-fold results for each single value of Laplace
for (i in 1:values) {
  id <- seq(i, nrow(NBPredict), values)
  name <- paste0("index", i)
  assign(name, id)
}

# extract the k-fold results for each value of Laplace
for(i in mget(ls(pattern = "index"))) {
  id <- NBPredict[(i), ]
  name <- paste0("laplace", i[1]) # i is the vector of row numbers; its first element identifies the Laplace value
  assign(name, id)
}

# Let's compare the average value for accuracy
# generate an empty data frame that you will fill with the average value of accuracy for each value of Laplace
NBresults <- data.frame(col1=vector())
for (i in 1:values){
  database <- get(paste0("laplace", i))
  Avg_Accuracy <- mean(database)
  NBresults <- rbind(NBresults, cbind(Avg_Accuracy))
}
# remember to write here the same range of Laplace values you explored in the loop above (25 values here, not 5!)
NBresults$LaplaceValue <- as.factor(seq(0.1, 2.5, by = 0.1))
head(NBresults[order(-NBresults$Avg_Accuracy), ]) # sorting by "Avg_Accuracy"
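# (Optional, purely illustrative sketch, not part of the original lab: a quick base-R plot of how the
# average CV accuracy moves across the Laplace values explored above.)
plot(as.numeric(as.character(NBresults$LaplaceValue)), NBresults$Avg_Accuracy,
     type = "b", xlab = "Laplace smoothing value", ylab = "Average CV accuracy")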