rm(list=ls(all=TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(cvTools)
library(reshape2)
library(dplyr)

#####################################################
# FIRST STEP: let's prepare the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)

myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")), remove_punct = TRUE, remove_numbers = TRUE,
                 tolower = TRUE, remove_symbols = TRUE, remove_separators = TRUE, remove_url = TRUE, split_hyphens = TRUE)

# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words
train <- as.matrix(Dfm_train)  # dense matrix

# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: accuracy .524
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Let's start to explore (let's tune!) the hyperparameters for the RF
######################################################
######################################################

# The main default hyperparameters in the case of a RF are the following ones:
# "ntree": the number of trees to grow (default = 500).
# "mtry": the number of variables randomly sampled as candidates at each split; for classification the default is floor(sqrt(p)),
# where p is the number of variables in x (in our case p = length(Dfm_train@Dimnames$features) = 1012!).
# "nodesize": the minimum size of terminal nodes. This controls the complexity of the trees.
# A smaller node size allows for deeper, more complex trees, while a larger node size results in shallower trees.
# This is another bias-variance tradeoff: deeper trees introduce more variance (risk of overfitting), while shallower trees
# introduce more bias (risk of not fully capturing unique patterns and relationships in the data). The default is nodesize = 1.
# "maxnodes": the maximum number of terminal nodes. Another way to control the complexity of the trees.
# More nodes equate to deeper, more complex trees; fewer nodes result in shallower trees.
# "sampsize": the number of samples to train each tree on. The default value is around 63.25% of the training set, since this is the
# expected share of unique observations in a bootstrap sample. Lower sample sizes can reduce the training time but may introduce more
# bias than necessary. Increasing the sample size can increase performance, but at the risk of overfitting, because it introduces more
# variance. Typically, when tuning this parameter we stay near the 60-80% range.
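# An optional minimal sketch (not part of the original lab steps): before tuning, you could fit a single RF
# with (mostly) default hyperparameters as a quick baseline; printing the fitted object shows the out-of-bag (OOB)
# error estimate, a rough benchmark against which to judge the tuned models. ntree is kept at 100 here only to save time;
# "rf_default" is just an illustrative object name.
set.seed(123)
rf_default <- randomForest(x = train, y = as.factor(Dfm_train@docvars$Sentiment), ntree = 100)
rf_default  # prints the OOB estimate of the error rate and the confusion matrix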
# create hyperparameter grid: you can add as many values and hyperparameters as you want.
# Here just two: ntree (100 and 200) and mtry (28 and 30), with nodesize fixed at 1
hyper_grid <- expand.grid(
  nodesize = c(1),
  ntree = c(100, 200),
  mtry = c(28, 30),
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)  # 4 possibilities by crossing ntree with mtry
hyper_grid
# if you want to add several values you can write something like:
# ntree = seq(100, 300, by = 10)

# to tune the RF, let's employ the function "tune"
# the default in this case is running a 10-fold cross-validation

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    nodesize = hyper_grid$nodesize[i],
    ntree = hyper_grid$ntree[i],
    mtry = hyper_grid$mtry[i]
  )
  set.seed(123)
  # train model
  rf.tune <- tune(randomForest, train.y = as.factor(Dfm_train@docvars$Sentiment), train.x = train,
                  ranges = params, do.trace = TRUE)
  # add min training error and accuracy to grid
  hyper_grid$min_error[i] <- min(rf.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

# let's see the results
head(arrange(hyper_grid, min_error), 10)

######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the SVM
######################################################
######################################################

# Let's stick to the linear kernel.
# Beyond C ("cost"), another main hyperparameter for a linear kernel is "epsilon", the parameter controlling the width of the
# epsilon-insensitive zone included in the insensitive-loss function (default: 0.1).
# Accordingly, I can investigate different combinations of values for C (default: C=1) as well as of epsilon

# create hyperparameter grid: you can add as many values and hyperparameters as you want. Here just two (cost and epsilon)
hyper_grid <- expand.grid(
  cost = c(0.1, 1),
  epsilon = c(0.1, 3),
  min_error = 0,  # a place to dump results
  accuracy = 0    # a place to dump results
)
nrow(hyper_grid)  # 4 possibilities by crossing cost with epsilon
hyper_grid

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    cost = hyper_grid$cost[i],
    epsilon = hyper_grid$epsilon[i]
  )
  set.seed(123)
  # train model
  sv.tune <- tune(svm, train.y = as.factor(Dfm_train@docvars$Sentiment), train.x = train,
                  kernel = "linear", ranges = params)
  hyper_grid$min_error[i] <- min(sv.tune$performances$error)
  hyper_grid$accuracy[i] <- 1 - hyper_grid$min_error[i]
}

head(arrange(hyper_grid, min_error), 10)
# in this specific case, there is not a lot of change.
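# A minimal sketch (my addition, not the original lab code): once the grid search is over, you could extract the
# best (cost, epsilon) combination from hyper_grid and re-fit a single linear SVM on the whole training-set with
# those values; "best_svm" and "svm_best" are just illustrative object names.
best_svm <- hyper_grid[which.min(hyper_grid$min_error), ]
best_svm
set.seed(123)
svm_best <- svm(x = train, y = as.factor(Dfm_train@docvars$Sentiment),
                kernel = "linear", cost = best_svm$cost, epsilon = best_svm$epsilon)
summary(svm_best)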
# Of course, changing the range of values of cost and epsilon over which you search (for example, by also looking at values of cost > 10)
# can change the final results

# for all other kernels (radial, polynomial), you also have "gamma" - a scaling parameter used to fit nonlinear boundaries
# (default: 1/(data dimension)) - in our case: 1/ncol(train)
# for the polynomial kernel you also have "degree" (default: 3) and "coef0" (default: 0)

# Let's explore a radial kernel
# to make things faster in the Lab, we let only gamma vary, while fixing the cost and the epsilon value to 1 and 0.3 respectively
hyper_grid <- expand.grid(
  cost = c(1),
  epsilon = c(0.3),
  gamma = c(0.001, 0.01, 0.1, 1),
  min_error = 0  # a place to dump results
)

# grid search
for(i in 1:nrow(hyper_grid)) {
  # create parameter list
  params <- list(
    cost = hyper_grid$cost[i],
    epsilon = hyper_grid$epsilon[i],
    gamma = hyper_grid$gamma[i]
  )
  set.seed(123)
  # train model
  sv.tune <- tune(svm, train.y = as.factor(Dfm_train@docvars$Sentiment), train.x = train,
                  kernel = "radial", ranges = params)
  # add min training error to grid
  hyper_grid$min_error[i] <- min(sv.tune$performances$error)
}

head(arrange(hyper_grid, min_error), 10)
# the best model here has gamma=0.001 when cost=1 and epsilon=0.3; note however that the error for the best model is
# the same as the one you get via the linear kernel

######################################################
######################################################
# Now let's explore (let's tune!) the hyperparameters for the NB!
######################################################
######################################################

# The main hyperparameter is the value of Laplace (the Laplace smoothing parameter).
# In the NB case you cannot use the "tune" function, so we will do it with a different script. It is a bit more complicated, but
# once written, you can always reuse it!
# Let's see an example of changing the value of Laplace from 0.5 to 2.5 by 0.5
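# Before the cross-validation loop, a minimal sketch (my addition, not the original lab code) of what a single
# multinomial NB fit looks like for a given Laplace value, here laplace = 1, fit on the whole training-set;
# "nb_single" is just an illustrative object name.
set.seed(123)
nb_single <- multinomial_naive_bayes(x = train, y = as.factor(Dfm_train@docvars$Sentiment), laplace = 1)
table(predict(nb_single, newdata = train), Dfm_train@docvars$Sentiment)  # in-sample predictions vs. actual classes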
# STEP 1: create the folds
ttrain <- train  # let's rename the original train matrix, given that we are going to reuse the name "train" below in the loop

# let's split our training-set into 10 folds
set.seed(123)  # set the seed for replicability
k <- 10  # the number of folds; whatever number of folds you choose here, the procedure below will still work!
folds <- cvFolds(NROW(ttrain), K = k)
str(folds)

for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]       # set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ]  # set the validation set
  for (j in seq(0.5, 2.5, by = 0.5)){  # here you can change the Laplace values as you want
    set.seed(123)
    newrf <- multinomial_naive_bayes(y = as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                                     x = train, laplace = j)  # fit on the train data only, using "Sentiment" as the outcome
    newpred <- predict(newrf, newdata = validation)  # get the predictions for the validation set (from the model just fit on the train data)
    class_table <- table("Predictions" = newpred, "Actual" = Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
    print(class_table)
    df <- confusionMatrix(class_table, mode = "everything")
    df.name <- paste0("conf.mat.nb", i, "/", j)  # create the name of the object that will store the confusion matrix for this fold/Laplace combination
    assign(df.name, df)
  }
}

NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())  # an empty data frame in which to store accuracy and F1 for each fold/Laplace combination

for(i in mget(ls(pattern = "conf.mat.nb"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for the negative class
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for the positive class
  NBPredict <- rbind(NBPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(NBPredict)

nrow(NBPredict)/k  # number of Laplace values estimated
values <- nrow(NBPredict)/k
values

# the results in NBPredict are stored as follows: first all the results of one fold for all the values of Laplace, then the next fold, and so on.
# For example, imagine that you have k=5 folds and Laplace takes just 2 values: 0.5 and 1. Then the first two Accuracy results in NBPredict
# are the Accuracy results you get in fold 1, for Laplace 0.5 first and then 1; the third and fourth Accuracy results are the
# Accuracy results you get in fold 2, again for Laplace 0.5 and then 1; and so on until fold 5

for (i in 1:values) {  # generate the list of row numbers that correspond to all the k-fold results for each single value of Laplace
  id <- seq(i, nrow(NBPredict), values)
  name <- paste0("index", i)
  assign(name, id)
}

for (i in 1:values) {  # extract the k-fold results for each value of Laplace
  id <- NBPredict[get(paste0("index", i)), ]
  name <- paste0("laplace", i)
  assign(name, id)
}

# Let's compare the average values of accuracy and F1
NBresults <- data.frame(col1=vector(), col2=vector())  # generate an empty data frame that you will fill with the average value of
# accuracy and F1 for each value of Laplace

for (i in 1:values){
  database <- get(paste0("laplace", i))
  Avg_Accuracy <- mean(database[, 1])
  Avg_F1 <- mean(colMeans(database[-1]))
  NBresults <- rbind(NBresults, cbind(Avg_Accuracy, Avg_F1))
}
row.names(NBresults) <- seq(0.5, 2.5, by = 0.5)  # remember to write here the range of the Laplace values you are exploring!
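# A minimal alternative sketch (my addition, not the original lab code): instead of building the "index"/"laplace"
# objects by hand, you could attach the Laplace value to each row of NBPredict and average with dplyr. This assumes
# the same Laplace sequence (0.5 to 2.5 by 0.5) and the same row order produced by the loops above.
NBPredict %>%
  mutate(laplace = rep(seq(0.5, 2.5, by = 0.5), times = k)) %>%
  group_by(laplace) %>%
  summarise(Avg_Accuracy = mean(Accuracy),
            Avg_F1 = mean((F1_negative + F1_positive) / 2))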
NBresults
head(NBresults[order(-NBresults$Avg_F1),])        # sorting by "F1"
head(NBresults[order(-NBresults$Avg_Accuracy),])  # sorting by "Accuracy"

# You see that there is indeed some change, but usually a hyper-grid search does not improve things dramatically with respect
# to the values you get when running a ML algorithm with its default hyperparameter values

######################################################
######################################################
# Once you have estimated the best hyperparameter settings for each ML algorithm, you could replicate the
# K-fold analysis to see which is the most advisable ML algorithm (given your training-set)
######################################################
######################################################
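# A minimal sketch (my addition, not the original lab code) of what such a comparison could look like: a k-fold loop
# that fits, for example, the tuned linear SVM and the tuned RF on the same folds and stores their accuracy side by side.
# The hyperparameter values below (cost = 1; ntree = 100, mtry = 28) are placeholders: plug in the best values you found above.
set.seed(123)
folds2 <- cvFolds(NROW(ttrain), K = k)
acc <- data.frame()
for (i in 1:k) {
  tr    <- ttrain[folds2$subsets[folds2$which != i], ]
  val   <- ttrain[folds2$subsets[folds2$which == i], ]
  y_tr  <- as.factor(Dfm_train[folds2$subsets[folds2$which != i], ]@docvars$Sentiment)
  y_val <- Dfm_train[folds2$subsets[folds2$which == i], ]@docvars$Sentiment
  svm_fit <- svm(x = tr, y = y_tr, kernel = "linear", cost = 1)
  rf_fit  <- randomForest(x = tr, y = y_tr, ntree = 100, mtry = 28)
  acc <- rbind(acc, data.frame(svm = mean(predict(svm_fit, val) == y_val),
                               rf  = mean(predict(rf_fit, val) == y_val)))
}
colMeans(acc)  # average accuracy across the k folds for each algorithm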