rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(cvTools)   # needed below for cvFolds() in the cross-validation loop
library(caret)
library(car)
library(xgboost)
library(ggplot2)
library(dplyr)
library(reshape)
library(Ckmeans.1d.dp)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# let's trim the DfM in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok2 <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers = TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_test <- dfm(tok2)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_test, 20)  # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between train and test-set
# by passing Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set!
# This moreover makes the DfM for the test-set smaller if you have a giant test-set
# compared to the training-set (which, of course, is not the case here!)
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
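# Side note (a short extra sketch, not part of the original pipeline): as.matrix()
# produces a *dense* matrix, which can exhaust memory when the vocabulary is large.
# xgboost also accepts sparse matrices in the Matrix package's dgCMatrix format,
# and a quanteda DfM can be coerced to that class. Assumption: your Matrix/xgboost
# versions support this coercion; test it before relying on it.
library(Matrix)
train_sparse <- as(Dfm_train, "dgCMatrix")  # sparse counterpart of "train"
test_sparse <- as(test_dfm, "dgCMatrix")    # sparse counterpart of "test"
dim(train_sparse)  # same dimensions as the dense version, far less memory
# these objects could then replace "train"/"test" in the xgboost() calls below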
#####################################################
# FIFTH STEP: let's estimate a ML model
#####################################################

#####################################################
# let's run a Gradient Boosting
#####################################################

# You always need to add the number of classes to be classified to the model call
# if your DV is a multi-categorical variable. In the present case, we have just two
# classes (negative & positive), so we do not care. Other situations could be
# different (as we will see)
numberOfClasses <- length(unique(Dfm_train@docvars$Sentiment))
numberOfClasses

# your DV should always be a numeric one starting from 0.
# If that is not the case (as here), you need to create such a variable
num <- as.factor(Dfm_train@docvars$Sentiment)
num
table(num)
num <- as.numeric(num)
num
num[num==1] <- 0
num[num==2] <- 1
table(num)

# let's add this variable back to our DfM and let's call it "code"
Dfm_train@docvars$code <- num
str(Dfm_train@docvars)

# same results! Good!
table(Dfm_train@docvars$code)
table(Dfm_train@docvars$Sentiment)

# If you have just 2 classes (positive/negative) as in our example, you must write objective = "binary:logistic".
# Note: logistic regression for binary classification returns the predicted probability of belonging to each given class (not the predicted class)!
# If you have more than 2 classes in your DV, you have to write objective = "multi:softmax".
# This is a multiclass classification using the softmax objective, and it returns the predicted class (not probabilities).
# As discussed above, in that case you also need to set an additional num_class parameter defining the number of unique classes.
# Alternatively you can use "multi:softprob": same as softmax, but it returns the predicted probability of each data point belonging to each class.

# There are several hyperparameters in a GB, as we have discussed (back to this when we discuss the grid-search).
# Here, I select a specific value for the number of trees ("nrounds") as well as for "eta".
# NOTE: "nthread" (number of threads) implements a parallelization on your computer. And this makes everything faster!
set.seed(123)
system.time(xgb.fit1 <- xgboost(
  data = train,
  label = Dfm_train@docvars$code,
  nrounds = 500,
  eta = 1,
  nthread = 4,
  objective = "binary:logistic",  # for binary
  eval_metric = "error",          # binary classification error rate
  verbose = 1                     # not silent; if you want it silent, set verbose = 0
))

# Let's do some Global Interpretation for our Gradient Boosting model!
# compute the feature importance matrix
importance_matrix <- xgb.importance(model = xgb.fit1)
head(importance_matrix)

# Gain: the relative contribution of the corresponding feature to the model, calculated by taking each feature's contribution
# for each tree in the model. In more detail: at each split in each tree, xgboost computes the improvement in the split-criterion
# (i.e., the reduction in the classification error). It then averages the improvement made by each variable across all the trees
# in which that variable is used. The variables with the largest average decrease in the classification error are considered most important.
# Cover: the relative number of observations related to this feature. For example, suppose you have 100 observations, 4 features and 3 trees,
# and suppose feature1 is used to decide the leaf node for 10, 5, and 2 observations in tree1, tree2 and tree3 respectively;
# then the metric will count cover for this feature as 10+5+2 = 17 observations. This will be calculated for all 4 features,
# and cover will be 17 expressed as a percentage of all features' cover metrics.
# Frequency: the percentage representing the relative number of times a particular feature occurs in the trees of the model.
# In the above example, if feature1 occurred in 2 splits, 1 split and 3 splits in tree1, tree2 and tree3 respectively, then the weight
# for feature1 will be 2+1+3 = 6. The frequency for feature1 is calculated as its percentage weight over the weights of all features.
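# A quick visual check of the importance matrix (a short extra sketch, not in the
# original script): xgb.ggplot.importance() plots the top features by the chosen
# measure and clusters the bars; this clustering is what the Ckmeans.1d.dp package
# loaded at the top is needed for.
xgb.ggplot.importance(importance_matrix, top_n = 20, measure = "Gain")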
# let's focus on GAIN (quite common...)
importance <- importance_matrix[order(importance_matrix$Gain, decreasing=TRUE),]
head(importance, n=20)

# The problem here is that we just get the Gain statistics w/o differentiating the words most important for each of the classes.
# For example: which are the most important words for the positive label? And for the negative one?
# So let's improve on that, following the same path that we followed with RF and SVM!

predicted_xgb <- round(predict(xgb.fit1, train))
table(predicted_xgb)
Dfm_train@docvars$predXGB <- predicted_xgb

# adding the sign [0/1 according to the predicted content of the review - neg or pos]
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predXGB"]==v,])
}

# let's assign a word to the pos/neg class according to its frequency (if, for example, the word "bad" appears
# 10 times among predicted negative reviews and 5 times among predicted positive reviews,
# we classify it as pertaining to the "negative reviews" world; and so on)
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
str(sign)

# get the feature names
names <- dimnames(train)[[2]]
str(names)

df <- data.frame(
  Feature = names,
  sign = sign-1,
  stringsAsFactors=F)
str(df)

importance <- merge(importance_matrix, df, by="Feature")
str(importance)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$Gain, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign==v], n=10)), collapse=", "))
}

# Let's draw a graph!
str(importance)
importance$Gain_OK <- ifelse(importance$sign>0, importance$Gain, importance$Gain*-1)
importance2 <- top_n(importance, 20, Gain_OK)
importance2
importance3 <- top_n(importance, -20, Gain_OK)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)

# reorder the features
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gain_OK))
importance_new$sign2 <- ifelse(importance_new$sign>0, "positive", "negative")
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gain_OK))

ggplot(importance_new, aes(Feature, Gain_OK, fill = sign2)) +
  geom_bar(stat="identity", fill="white") +
  theme_classic() +
  geom_col(show.legend = T) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Gain coefficient (we recoded as negative the values for the Negative features)") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Movie Reviews",
       subtitle = "Negative (-) versus Positive (+) words - XGB Model",
       fill = "Movie Reviews")

# let's FINALLY predict the test data
predicted_xgb <- predict(xgb.fit1, test)
table(predicted_xgb)
# What's that??? Remember: logistic regression for binary classification in XGB returns the predicted probability (not the class)!
# Therefore let's use "round" (i.e., probabilities above 0.5 become 1, the rest become 0)
predicted_xgb <- round(predict(xgb.fit1, test))
table(predicted_xgb)
prop.table(table(predicted_xgb))
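# A short extra sketch (not in the original script): store the predicted class next
# to the raw test reviews so you can eyeball a few classifications. Assumption: the
# csv read into x10 keeps its texts in a column named "text" (the quanteda default
# that corpus(x10) relies on above); adjust the column name if yours differs.
x10$pred_sentiment <- ifelse(predicted_xgb==1, "positive", "negative")
head(x10[, c("text", "pred_sentiment")], 10)  # first 10 reviews with their predicted label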
######################################################
######################################################
# Let's run a CROSS-VALIDATION with k-fold=5 using a loop!
# HERE you have JUST 2 classes for the outcome (negative/positive)
######################################################
######################################################

# Some R packages, such as caret, allow you to run a k-fold cross-validation with a few lines of command.
# But I want you to understand what's going on, so it is better to work with this loop!
# And then, if you desire, move to caret (see the sketch at the end of the script).

# STEP 1: create the 5 folds
ttrain <- train  # let's rename the original train matrix, given that we are going to reuse the name "train" below in the loop

# let's split our training-set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5  # the number of folds; whatever number of folds you decide here, the procedure below will always work!
folds <- cvFolds(NROW(ttrain), K=k)  # cvFolds() comes from the cvTools package loaded at the top
str(folds)

# STEP 2: run the loop, estimating the model on k-1 folds and validating it on the remaining fold
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]
  validation <- ttrain[folds$subsets[folds$which == i], ]
  set.seed(123)
  newrf <- xgboost(
    data = train,
    label = Dfm_train[folds$subsets[folds$which != i], ]@docvars$code,  # here "code", NOT "Sentiment"!!!
    nrounds = 500,
    eta = 1,
    nthread = 4,
    objective = "binary:logistic",  # for binary
    eval_metric = "error",          # binary classification error rate
    verbose = 0)
  newpred <- predict(newrf, newdata=validation)  # returns probabilities...
  newpred <- round(newpred)                      # ...that we round to get the predicted class
  class_table <- table("Predictions"=newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$code)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.xgb", i)
  assign(df.name, df)
})

# STEP 3: the metrics
XGBPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())

for(i in mget(ls(pattern = "conf.mat.xgb"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value in the matrix
  p <- as.data.frame((i)$byClass)
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for the negative class in the matrix
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for the positive class in the matrix
  XGBPredict <- rbind(XGBPredict, cbind(Accuracy, F1_negative, F1_positive))
}

XGBPredict[is.na(XGBPredict)] <- 0
str(XGBPredict)

# let's compute the average value for accuracy and F1 across the 5 folds
acc_xgb_avg <- mean(XGBPredict[, 1])
f1_xgb_avg <- mean(colMeans(XGBPredict[-1]))
acc_xgb_avg
f1_xgb_avg
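# FINAL NOTE: for comparison, here is a minimal sketch of the same 5-fold
# cross-validation in caret, as promised above. Hedged assumptions: method =
# "xgbTree" requires the xgboost package; caret's tuneGrid must list all seven
# xgbTree tuning parameters, so the ones we never touched above (max_depth, gamma,
# colsample_bytree, min_child_weight, subsample) are set here to xgboost's defaults.
# caret::train() is called with the namespace prefix because "train" is already the
# name of our matrix.
cv_ctrl <- trainControl(method = "cv", number = 5)
xgb_grid <- expand.grid(nrounds = 500, eta = 1, max_depth = 6, gamma = 0,
                        colsample_bytree = 1, min_child_weight = 1, subsample = 1)
set.seed(123)
caret_fit <- caret::train(x = ttrain, y = factor(Dfm_train@docvars$code),
                          method = "xgbTree", trControl = cv_ctrl, tuneGrid = xgb_grid)
caret_fit$results  # cross-validated Accuracy (and Kappa) averaged over the 5 folds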