rm(list=ls(all=TRUE)) getwd() # setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL") setwd("C:/Users/luigi/Dropbox/TOPIC MODEL") getwd() library(quanteda) library(readtext) library(caTools) library(e1071) library(randomForest) library(caret) library(naivebayes) library(car) library(cvTools) library(reshape2) library(dplyr) ##################################################### # let's prepare the training-set with 3 categories (this script works fine for any number of categories>2) ##################################################### uk_train <- read.csv("uk_train.csv") str(uk_train) myCorpusTwitterTrain <- corpus(uk_train) Dfm_train <- dfm(myCorpusTwitterTrain , remove = c(stopwords("english"), ("amp")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_separators=TRUE, remove_url = TRUE, split_hyphens = TRUE ) # Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...) Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE) topfeatures(Dfm_train , 20) # 20 top words train <- as.matrix(Dfm_train) # our classes table(Dfm_train@docvars$Sentiment) # our benchmark: accuracy .608 prop.table(table(Dfm_train@docvars$Sentiment)) ###################################################### ###################################################### # which main changes? Compared to the script "Lab 8 part 1" # consider the case of a SVM - but that applies to all the other scripts ###################################################### ###################################################### # STEP 1: create the 5 folds ttrain <- train set.seed(123) # set the see for replicability k <- 5 folds <- cvFolds(NROW(ttrain ), K=k) str(folds) # STEP 2: the LOOP system.time(for(i in 1:k){ train <- ttrain [folds$subsets[folds$which != i], ] # Set the training set validation <- ttrain [folds$subsets[folds$which == i], ] # Set the validation set set.seed(123) newrf <- svm(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment) ,x=train, kernel='linear', cost = 1) # Get your new model # (just fit on the train data) and ADD the name of the output (in this case "Sentiment") newpred <- predict(newrf,newdata=validation) # Get the predicitons for the validation set (from the model just fit on the train data) class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment) print(class_table) df<-confusionMatrix( class_table, mode = "everything") df.name<-paste0("conf.mat.sv",i) # create the name for the object that will save the confusion matrix for each loop (=5) assign(df.name,df) }) # STEP 3: the metrics SVMPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) ##### FIRST CHANGE # Why 4 columns NOW? 1 for accuracy; and 3 for the K1 value of the classes in the Sentiment: negative, neutral, positive. # According to the number of classes in your output variable, changes the number of columns to fill!!! for(i in mget(ls(pattern = "conf.mat.sv")) ) { Accuracy <-(i)$overall[1] # save in the matrix the accuracy value ##### SECOND CHANGE: the following 4 lines; p <- as.data.frame((i)$byClass) F1_negative <- p$F1[1] # save in the matrix the F1 value for negative F1_neutral <- p$F1[2] # save in the matrix the F1 value for neutral F1_positive <- p$F1[3] # save in the matrix the F1 value for positive SVMPredict <- rbind(SVMPredict , cbind(Accuracy , F1_negative , F1_neutral, F1_positive )) } SVMPredict [is.na(SVMPredict )] <- 0 # if I get some NA for some categories with respect to F1 (this happens when BOTH precision and recall score for that category is 0), replace NA with 0 str(SVMPredict ) # you see that we are not doing that well with the class "negative" # Let's compare the average value for accuracy and f1 acc_sv_avg <- mean(SVMPredict[, 1] ) f1_sv_avg <- mean(colMeans(SVMPredict[-1] )) acc_sv_avg f1_sv_avg # you see here that we do not improve that much compared to our benchmark model # moreover there is a wide gap between accuracy and the avg. value of F1: why? cause we are doing good with some class (neutral) # and bad with the others. Why? The presence of an imbalanced dataset could be a reason for that. So here it does not matter # the type of fancy ML algorithm (and connceted hyper-parameters mix) you are employing. # The only way for you to improve the performance of a ML algorithm in the CV stage (before predicting the test-set) is going back # to the training-set and improving it (for example by adding more texts displaying the classes in which you are doing bad). # Any ML algorithm on a poorly built training-set, is going always to be a bad ML algorithm in terms of performance ###################################################### ###################################################### # which main changes? Compared to the script "Lab 8 part 2" # the second change is just related to the NB loop (but you should already know about it!) ###################################################### ###################################################### uk_train <- read.csv("uk_train.csv") str(uk_train) myCorpusTwitterTrain <- corpus(uk_train) Dfm_train <- dfm(myCorpusTwitterTrain , remove = c(stopwords("english"), ("amp")), remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE, remove_separators=TRUE, remove_url = TRUE, split_hyphens = TRUE ) # Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...) Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE) topfeatures(Dfm_train , 20) # 20 top words train <- as.matrix(Dfm_train) # STEP 1: create the folds ttrain <- train # let's change the name of the original train data.frame, given that we are already going to use such name below in the loop # let's split our training-set in 10 folds set.seed(123) # set the see for replicability k <- 10 # the number of folds; it does not matter the number of folds you decide here; the below procedure always will work! folds <- cvFolds(NROW(ttrain ), K=k) str(folds) for(i in 1:k){ train <- ttrain [folds$subsets[folds$which != i], ] # Set the training set validation <- ttrain [folds$subsets[folds$which == i], ] # Set the validation set for (j in seq(0.1, 2.5, by = 0.1)){ # here you can change the values as you want set.seed(123) newrf <- multinomial_naive_bayes(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment) ,x=train, laplace = j) # (just fit on the train data) and ADD the name of the output (in this case "Sentiment") newpred <- predict(newrf,newdata=validation) # Get the predicitons for the validation set (from the model just fit on the train data) class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment) print(class_table) df<-confusionMatrix( class_table, mode = "everything") df.name<-paste0("conf.mat.nb",i, sep = "/", j) # create the name for the object that will save the confusion matrix for each loop (=5) assign(df.name,df) } } NBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector(), col4=vector()) ##### FIRST CHANGE # Why 4 columns NOW? 1 for accuracy; and 3 for the K1 value of the classes in the Sentiment: negative, neutral, positive. # According to the number of classes in your output variable, changes the number of columns to fill!!! for(i in mget(ls(pattern = "conf.mat.nb")) ) { Accuracy <-(i)$overall[1] # save in the matrix the accuracy value ##### SECOND CHANGE: the following 4 lines; p <- as.data.frame((i)$byClass) F1_negative <- p$F1[1] # save in the matrix the F1 value for negative F1_neutral <- p$F1[2] # save in the matrix the F1 value for neutral F1_positive <- p$F1[3] # save in the matrix the F1 value for positive NBPredict <- rbind(NBPredict , cbind(Accuracy , F1_negative, F1_neutral, F1_positive)) } NBPredict[is.na(NBPredict)] <- 0 # if I get some NA for some categories with respect to F1 (this happens when BOTH precision and recall score for that category is 0), replace NA with 0 str(NBPredict) nrow(NBPredict)/k # number of estimated values for Laplace values <- nrow(NBPredict)/k values # the results in NBPredict are saved like that: first all the results of the first k-fold for all the values of Laplace, and so on. # for example imagine that you have k-fold=5 and Laplace assume just 2 values: 0.5 and 1. Then the first two Accuracy results in NBPredict # are the Accuracy results you get in k-fold=1 for Laplace first 0.5 and then 1; the third and fourth Accuracy results are the # the Accuracy results you get in k-fold=2 for Laplace first 0.5 and then 1; and so on till k-fold=5 for (i in 1:values ) { # generate the list of numbers that correspond to all the k-folds results for each single value of Laplace id <- seq(i,nrow(NBPredict),values) name <- paste0("index",i) assign(name, id) } for(i in mget(ls(pattern = "index")) ) { # extract the k-folds results for each value of Laplace id <- NBPredict [(i), ] name <- paste0("laplace",i) assign(name, id) } # Let's compare the average value for accuracy and f1 NBresults <- data.frame(col1=vector(), col2=vector()) # generate an empty database that you will fill with the average value of # accuracy and F1 for each value of Laplace for (i in 1:values){ database=get(paste0("laplace",i)) Avg_Accuracy <- mean(database [, 1] ) Avg_F1<- mean(colMeans(database [-1] )) NBresults <- rbind(NBresults , cbind(Avg_Accuracy, Avg_F1)) } row.names(NBresults) <- seq(0.1, 2.5, by = 0.1) # remember to write it here the range of the Laplace values you are exploring! NBresults head(NBresults [order(-NBresults $Avg_F1),]) # sorting by "FI" head(NBresults [order(-NBresults $Avg_Accuracy),]) # sorting by "Accuracy"