# Reset the session: remove every object (hidden ones included) from the
# workspace, then switch to the project folder and show the working directory.
rm(list = ls(all.names = TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)
#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################
# let's focus on MOVIE reviews (either positive or negative)
# Read the labelled reviews and build the corpus
x <- read.csv("train_review2.csv", stringsAsFactors = FALSE)
str(x)
myCorpusTwitterTrain <- corpus(x)

# Tokenise, drop English stopwords, stem, and build the document-feature matrix
tok2 <- tokens(
  myCorpusTwitterTrain,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  split_hyphens = TRUE,
  remove_separators = TRUE
)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(
  Dfm_train,
  min_docfreq = 0.05,
  docfreq_type = "prop",
  verbose = TRUE
)
topfeatures(Dfm_train, 20) # 20 top words

# Always check whether, after trimming, some texts are left with just 0s!
# (no problem here)
Dfm_train[ntoken(Dfm_train) == 0, ]

# Our classes. The labels are read through the public docvars() accessor
# rather than the internal @docvars slot, whose layout is a package detail.
table(docvars(Dfm_train, "Sentiment"))
# our benchmark: accuracy .524
prop.table(table(docvars(Dfm_train, "Sentiment")))

# Convert the dfm into a (dense) matrix for the ML functions used below
train <- as.matrix(Dfm_train)
# Note: usually a hyperparameter grid search can improve your model fit with respect
# to the values you get when running a ML algorithm with its default hyperparameter values;
# but do not expect any dramatic change!
######################################################
######################################################
# Let's start to explore (let's tune!) the hyperparameters for the RF
######################################################
######################################################
# The main default hyperparameters in the case of a RF are the following ones:
# 1) "ntree" (Number of trees to grow; default=500),
# 2) "mtry" (Number of variables randomly sampled as candidates at each split, where p is number of variables in x; the default is =sqrt(p);
# In our case p=length(Dfm_train@Dimnames$features)=1139!; sqrt(p)=33.75)
# 3) "nodesize": Minimum size of terminal nodes. This controls the complexity of the trees.
# Smaller node size allows for deeper, more complex trees while larger node size results in shallower trees.
# This is another bias-variance tradeoff where deeper trees introduce more variance (risk of overfitting) and shallower trees
# introduce more bias (risk of not fully capturing unique patterns and relationships in the data). The default is nodesize=1
# 4) sampsize: the number of samples to train on. The default value is 63.25% of the training set since this is the expected value of
# unique observations in the bootstrap sample. Lower sample sizes can reduce the training time but may introduce more bias
# than necessary. Increasing the sample size can increase performance but at the risk of overfitting because it introduces more
# variance. Typically, when tuning this parameter we stay near the 60-80% range. However, it also depends on how large your
# training-set is! With a very small training-set, increasing sampsize could be a reasonable choice
# Let's create an hyperparameter grid. We can add as many values and hyperparameters you want.
# Here just two: ntree (100 and 200) and mtry (28 and 30) with nodesize fixed at 1
# Hyperparameter grid for the random forest: every combination of ntree and
# mtry, plus two placeholder columns that the grid search fills in.
hyper_grid <- expand.grid(
  ntree = c(100, 200),
  mtry = c(28, 30),
  min_error = 0, # a place to dump results
  accuracy = 0 # a place to dump results
)
nrow(hyper_grid) # 4 possibilities by crossing ntree with mtry
hyper_grid
# If you want to explore several values you can write something like:
# ntree = seq(100, 300, by = 10)
# To tune the RF, let's employ the function "tune".
# By default, tune implements a 10-fold CV, but you can control that
# through tune.control. Here the folds are set to 5 to make things faster.

# Class labels: built once (they do not change across grid rows) and read
# through the public docvars() accessor rather than the @docvars slot.
sentiment_labels <- as.factor(docvars(Dfm_train, "Sentiment"))

# Grid search: one 5-fold cross-validated fit per row of hyper_grid
for (i in seq_len(nrow(hyper_grid))) {
  # parameter list for this grid row
  params <- list(
    ntree = hyper_grid$ntree[i],
    mtry = hyper_grid$mtry[i]
  )
  # same seed for every row, so the random state is reset before each fit
  set.seed(123)
  # train model
  rf.tune <- tune(
    randomForest,
    train.x = train,
    train.y = sentiment_labels,
    ranges = params,
    do.trace = TRUE,
    tunecontrol = tune.control(cross = 5)
  )
  # store the cross-validated error (not a training error) and the accuracy
  hyper_grid$min_error[i] <- min(rf.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}
# number of folds for CV (as reported by the last fit)
rf.tune$sampling
str(hyper_grid)
# let's see the results, best setting first
head(arrange(hyper_grid, min_error))
# Note that here we just estimate accuracy and not F1. However, now we are selecting the best hyperparameter setting for each given ML;
# then we have to go back to our previous script to run a full CV (with F1 included) across different MLs.
# The rationale for focusing only on accuracy is that, for a given ML, there is a correlation between accuracy and F1, and if a RF model is doing better
# than another RF model in terms of accuracy, it will almost surely do the same in terms of F1 as well
######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the SVM
######################################################
######################################################
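# Let's stick to the linear kernel.
# Beyond C ("cost"), another hyperparameter exposed by svm() is "epsilon" (the parameter controlling the width of the insensitive zone included
# in the insensitive-loss function (default: 0.1)).
# NOTE: the insensitive-loss function belongs to eps-regression; with a factor outcome svm() fits a C-classification, so epsilon is not
# expected to affect the results below (verify against the e1071/libsvm documentation).
# Accordingly, we can investigate different combinations of values for C (default: C=1) as well as of epsilon.
# Create the hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two (cost and epsilon)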
# Hyperparameter grid for the linear-kernel SVM: every combination of cost
# and epsilon, plus two placeholder columns that the grid search fills in.
# NOTE(review): epsilon is the eps-regression loss parameter; with a factor
# outcome it presumably has no effect, so rows that differ only in epsilon
# are expected to tie - confirm against the e1071 documentation.
hyper_grid <- expand.grid(
  cost = c(0.1, 1),
  epsilon = c(0.1, 3),
  min_error = 0, # a place to dump results
  accuracy = 0 # a place to dump results
)
nrow(hyper_grid) # 4 possibilities by crossing cost with epsilon
hyper_grid

# Class labels: built once (they do not change across grid rows) and read
# through the public docvars() accessor rather than the @docvars slot.
sentiment_labels <- as.factor(docvars(Dfm_train, "Sentiment"))

# Grid search: one 5-fold cross-validated fit per row of hyper_grid
for (i in seq_len(nrow(hyper_grid))) {
  # parameter list for this grid row
  params <- list(
    cost = hyper_grid$cost[i],
    epsilon = hyper_grid$epsilon[i]
  )
  # same seed for every row, so the random state is reset before each fit
  set.seed(123)
  # train model
  sv.tune <- tune(
    svm,
    train.x = train,
    train.y = sentiment_labels,
    kernel = "linear",
    ranges = params,
    tunecontrol = tune.control(cross = 5)
  )
  # store the cross-validated error and the corresponding accuracy
  hyper_grid$min_error[i] <- min(sv.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}
# number of folds for CV (as reported by the last fit)
sv.tune$sampling
head(arrange(hyper_grid, min_error))
# In this specific case, there is no change.
# Of course, changing the values of C and epsilon that you search over (for example by also looking at values of C > 10)
# can change the final results.
# For the other kernels (radial, polynomial), you also have the "gamma" hyperparameter - a scaling parameter used to fit nonlinear boundaries.
# Intuitively, the gamma parameter defines how far the influence of a single training example reaches,
# with low values meaning "far" and high values meaning "close".
# If gamma is very large then we get quite fluctuating and wiggly decision boundaries, which means high variance and overfitting.
# If gamma is small, the decision boundary is smoother and has low variance.
# (default: 1/(data dimension)) - in our case: 1/ncol(train)
# Finally, for polynomial kernel you also have "degree" (default: 3) and "coef0" (default: 0)
# Let's explore a radial kernel (the hyperparameter epsilon does not apply here)
# Hyperparameter grid for the radial-kernel SVM: every combination of cost
# and gamma, plus two placeholder columns that the grid search fills in.
hyper_grid2 <- expand.grid(
  cost = c(0.1, 1),
  gamma = c(0.001, 0.01, 1),
  min_error = 0, # a place to dump results
  accuracy = 0 # a place to dump results
)
nrow(hyper_grid2) # 6 possibilities
hyper_grid2

# Class labels: built once (they do not change across grid rows) and read
# through the public docvars() accessor rather than the @docvars slot.
sentiment_labels <- as.factor(docvars(Dfm_train, "Sentiment"))

# Grid search: one 5-fold cross-validated fit per row of hyper_grid2
for (i in seq_len(nrow(hyper_grid2))) {
  # parameter list for this grid row
  params <- list(
    cost = hyper_grid2$cost[i],
    gamma = hyper_grid2$gamma[i]
  )
  # same seed for every row, so the random state is reset before each fit
  set.seed(123)
  # train model
  sv.tune <- tune(
    svm,
    train.x = train,
    train.y = sentiment_labels,
    kernel = "radial",
    ranges = params,
    tunecontrol = tune.control(cross = 5)
  )
  # store the cross-validated error and the corresponding accuracy
  hyper_grid2$min_error[i] <- min(sv.tune$performances$error)
  hyper_grid2$accuracy[i] <- 1 - hyper_grid2$min_error[i]
}
# number of folds for CV (as reported by the last fit)
sv.tune$sampling
# radial-kernel results, followed by the contents of hyper_grid for comparison
head(arrange(hyper_grid2, min_error), 10)
head(arrange(hyper_grid, min_error), 10)
# best model here has gamma=0.001 when cost=1; note however how the error for the best model is worse
# than the one you get via the linear kernel (not a surprising result: as already discussed in class, with text-classification
# a linear kernel usually is the best one)
######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the NB!
######################################################
######################################################
# The main hyperparameter is the value of Laplace.
# In the NB case you cannot use the "tune" function, so we will do with a different script.
# A bit more complicated, but once written, you can always use that (with some minor modifications)!
# Let's see an example of changing the value of Laplace from 0.5 to 2.5 by 0.5
# STEP 1: create the folds
# Manual k-fold cross-validation of the Laplace value for the multinomial NB.
# Results are kept in ordinary indexed objects (a matrix and a list) instead
# of dynamically named globals recovered with ls(), so the bookkeeping does
# not depend on how object names happen to sort.

# Keep the full matrix under its own name. The per-fold subsets below use
# their own names, so `train` is not overwritten by the loop.
ttrain <- train
# Class labels, read once through the public docvars() accessor
sentiment_labels <- as.factor(docvars(Dfm_train, "Sentiment"))
# The Laplace values to explore: change them HERE only, everything below adapts
laplace_grid <- seq(0.5, 2.5, by = 0.5)
values <- length(laplace_grid) # number of Laplace values explored

# STEP 1: create the folds
set.seed(123) # set the seed for replicability
k <- 5 # the number of folds; the procedure below works for any number of folds
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

# STEP 2: for every fold and every Laplace value, fit on the other folds and
# score on the held-out fold.
# nb_accuracy: rows = Laplace values, columns = folds
nb_accuracy <- matrix(
  NA_real_,
  nrow = values,
  ncol = k,
  dimnames = list(
    laplace = as.character(laplace_grid),
    fold = as.character(seq_len(k))
  )
)
# confusion matrices, stored fold by fold under the names "conf.mat.nb<fold>/<laplace>"
conf_mat_nb <- vector("list", values * k)
names(conf_mat_nb) <- paste0(
  "conf.mat.nb", rep(seq_len(k), each = values),
  "/", rep(laplace_grid, times = k)
)

for (i in seq_len(k)) {
  train_idx <- folds$subsets[folds$which != i]
  valid_idx <- folds$subsets[folds$which == i]
  # drop = FALSE keeps a matrix even if a fold holds a single document
  fold_train <- ttrain[train_idx, , drop = FALSE] # training part for this fold
  validation <- ttrain[valid_idx, , drop = FALSE] # validation part for this fold
  y_train <- droplevels(sentiment_labels[train_idx])
  y_valid <- sentiment_labels[valid_idx]

  for (j in seq_along(laplace_grid)) {
    # model fit on the training part only
    nb_fit <- multinomial_naive_bayes(
      x = fold_train,
      y = y_train,
      laplace = laplace_grid[j]
    )
    # predictions for the validation part, on the full set of class levels so
    # that the table below is square
    newpred <- factor(
      predict(nb_fit, newdata = validation),
      levels = levels(sentiment_labels)
    )
    class_table <- table("Predictions" = newpred, "Actual" = y_valid)
    print(class_table)
    cm <- confusionMatrix(class_table, mode = "everything")
    conf_mat_nb[[(i - 1L) * values + j]] <- cm
    # only accuracy is kept here; F1 could be extracted from `cm` the same way
    nb_accuracy[j, i] <- cm$overall[["Accuracy"]]
  }
}

# STEP 3: long-format results, one row per (fold, Laplace value) pair.
# Rows are ordered fold by fold: first all the Laplace values for fold 1,
# then all the Laplace values for fold 2, and so on up to fold k.
NBPredict <- data.frame(
  Accuracy = as.vector(nb_accuracy),
  min_error = 1 - as.vector(nb_accuracy),
  fold = rep(seq_len(k), each = values),
  laplace = rep(laplace_grid, times = k)
)
str(NBPredict)
nrow(NBPredict) / k # number of estimated values for Laplace
values

# STEP 4: average accuracy and error across the k folds for each Laplace value
NBresults <- data.frame(
  Avg_Accuracy = rowMeans(nb_accuracy),
  Avg_minError = rowMeans(1 - nb_accuracy),
  row.names = as.character(laplace_grid)
)
NBresults
head(NBresults[order(-NBresults$Avg_Accuracy), ]) # sorting by "Accuracy"
######################################################
######################################################
# Once you have estimated the best hyperparameters setting for the ML, you could replicate the
# K-fold analysis to see now which is the most advisable ML algorithm (given your training-set)
######################################################
######################################################