rm(list=ls(all=TRUE))
getwd()
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))

myCorpusTwitterTrain <- corpus(x)
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_separators = TRUE,
                 remove_url = TRUE, split_hyphens = TRUE)

# let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_review2.csv", stringsAsFactors=FALSE)
# NOTE that in this case we also have the "true" value for sentiment in the test-set.
# Of course this usually won't happen: it is a test-set after all!
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
Dfm_test <- dfm(myCorpusTwitterTest, remove = c(stopwords("english")),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_separators = TRUE,
                remove_url = TRUE, split_hyphens = TRUE)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 0.05, verbose = TRUE, docfreq_type = "prop")
topfeatures(Dfm_test, 20)  # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing Dfm_train
# to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set!
# This moreover makes the DfM for the test-set smaller when you have a giant test-set compared
# to the training-set (which, of course, is not the case here!)
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)  # dense matrix
object.size(train)
# a compressed sparse matrix!
trainSP <- as(Dfm_train, "dgCMatrix")  # compressed matrix
object.size(trainSP)
object.size(train)/object.size(trainSP)
# this is going to make a HUGE difference when you have large train and test matrices!
# We will mainly use normal (i.e., not compressed) matrices in our class, given that we will always use small datasets.
# But keep this in mind!
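# A side note (a minimal sketch, not needed for what follows): the same compressed conversion shown above for
# the training-set also works for the matched test dfm; "testSP" is just an illustrative object name.
testSP <- as(test_dfm, "dgCMatrix")
object.size(testSP)  # again much smaller than the dense as.matrix() version created in the next step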
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################

#####################################################
# Let's start with a Naive Bayes Model
#####################################################

# we will use the naivebayes package. Another possible package you can consider: the fastNaiveBayes package
# given our training-set, we have to use a Multinomial rather than a Bernoulli distribution, given that our
# features can take values other than just 0/1 (i.e., they are counts, not a one-hot encoding). And in fact:
table(Dfm_train@x)
# to run a Bernoulli model with naivebayes, just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes"
# in the command below (a minimal sketch follows the NB prediction at the end of this section)

# our DV
str(Dfm_train@docvars$Sentiment)
# that's a character variable! Not good! So we will use as.factor to transform it into a factor variable
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(NB22 <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$Sentiment), laplace = 1))
summary(NB22)
prop.table(table(Dfm_train@docvars$Sentiment))  # prior probabilities

# let's see the association between words and probabilities (i.e., the matrix with class-conditional parameter estimates).
# Take a look at "family" and "comedy": the former increases the probability of a positive review compared to the latter
head(NB22$params)

# let's investigate this a bit more
NB_prob <- as.data.frame(NB22$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# the features with the largest difference between the positive and negative conditional probabilities
NB_prob$diff <- NB_prob$pos-NB_prob$neg
str(NB_prob)
print(head(NB_prob[order(NB_prob$diff, decreasing=TRUE),], 15))   # positive words
print(head(NB_prob[order(NB_prob$diff, decreasing=FALSE),], 15))  # negative words
NB_prob$sign <- ifelse(NB_prob$diff>0, "positive", "negative")
str(NB_prob)

# let's extract the 20 most positive and the 20 most negative contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature = reorder(Feature, diff))

ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Difference in the conditional probabilities") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Movie Reviews", subtitle = "Negative (-) versus Positive (+) words - NB")

# let's FINALLY predict the test-set
predicted_nb <- predict(NB22, test)
table(predicted_nb)
prop.table(table(predicted_nb))
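# As noted above, a Bernoulli NB treats each feature as merely present/absent rather than as a count.
# A minimal sketch (not run in class; the object names "Dfm_train_bin", "train_bin" and "NB_bern" are just
# illustrative choices): we binarize the dfm with dfm_weight(scheme = "boolean") before fitting.
Dfm_train_bin <- dfm_weight(Dfm_train, scheme = "boolean")  # every non-zero count becomes 1
train_bin <- as.matrix(Dfm_train_bin)
set.seed(123)
NB_bern <- bernoulli_naive_bayes(x = train_bin, y = as.factor(Dfm_train@docvars$Sentiment), laplace = 1)
summary(NB_bern)
# (to predict the test-set, the test matrix would need the same boolean weighting)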
#####################################################
# let's run a Random Forest
#####################################################

# we will use the randomForest package. If you are employing compressed matrices, please use the ranger package
# (ranger is also usually faster than randomForest; a minimal sketch follows the RF test-set prediction below).
# Note that, as a hyperparameter (or tuning parameter), I select a specific value for the number of trees.
# A RF also has other hyperparameters. More on this later on
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                               importance = TRUE, do.trace = TRUE, ntree = 500))
RF
# why 32 for the no. of variables randomly tried at each split? That's another hyperparameter of a RF.
# The default is mtry = sqrt(p), where p is the number of variables in x.
# In our case p = length(Dfm_train@Dimnames$features) = 1058,
# and sqrt(length(Dfm_train@Dimnames$features)) = 32.5, which is rounded down to 32

# A natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides an efficient and reasonable approximation of the test error. This gives you a built-in validation set
# without any extra work on your part, and you do not need to sacrifice any of your training data for validation.
# This makes identifying the number of trees required to stabilize the error rate during tuning more efficient;
# however, REMEMBER: this is a weaker form of validation than tuning several hyperparameters via k-fold
# cross-validation, as we will see later on!
RF
# why OOB = 20.6%? This estimate is calculated by counting how many points in the training set were misclassified
# out-of-bag (67 negative and 36 positive = 103) and dividing this number by the total number of observations
# (103/500 = 20.6%). A small sanity check follows right after the RF2 fit below.
# We will discuss the Confusion matrix at great length next week!
plot(RF)  # shows the errors for the two classes (red = negative; green = positive) and, in black, the average OOB error
str(RF$err.rate)
error <- as.data.frame(RF$err.rate)
str(error)
# number of trees with the lowest OOB error
which.min(error$OOB)  # 432
set.seed(123)
system.time(RF2 <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                                importance = TRUE, ntree = 432, do.trace = TRUE))
RF2
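# A quick sanity check on the OOB figure discussed above (a sketch, not part of the original pipeline):
# randomForest stores the OOB-based predictions in RF$predicted and the OOB confusion matrix in RF$confusion,
# so the printed OOB estimate should match the share of misclassified OOB predictions.
RF$confusion  # rows = true classes, columns = OOB-predicted classes (plus the per-class error rate)
mean(RF$predicted != as.factor(Dfm_train@docvars$Sentiment))  # should reproduce the ~20.6% OOB error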
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable
# provides to the model for predicting the outcome
head(RF$importance[,3:4])  # let's grab the result
varImpPlot(RF)
# Each feature's importance is assessed based on two criteria:
# - MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable
#   is omitted from the training set. Caveat: if two variables are somewhat redundant, then omitting one of them may
#   not lead to a massive loss in prediction performance, but would make the second variable more important.
# - MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the
#   data, how pure will the nodes be? Highest purity means that each node contains only elements of a single class.
#   Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that
#   feature is for splitting the data correctly.
# Please note that these measures are used to rank variables in terms of importance; their absolute values can be disregarded.
# The problem is that we only get, say, an overall GINI statistic, without differentiating the words that matter most for each class.
# Which are, then, the most important words for the positive label? And for the negative one?

# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))

# let's predict the training-set and store the predictions as a new variable in the docvars of the training-set dfm
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf)
Dfm_train@docvars$predRF <- ifelse(predicted_rf=="neg", 0, 1)
# NOTE: perfect prediction of the training-set!
table(Dfm_train@docvars$Sentiment, Dfm_train@docvars$predRF)

# adding the sign [0/1 according to the predicted content of the review - neg or pos]
sums <- list()
for (v in 0:1){
  sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
# let's assign a word to pos/neg according to its frequency (if, for example, the word "bad" appears
# 10 times among predicted negative reviews and 5 times among predicted positive reviews,
# we classify it as pertaining to the "negative reviews" world; and so on)
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
# get the feature names
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(Feature = names, sign = sign - 1, stringsAsFactors = FALSE)
str(df)
importance <- merge(importance_RF, df, by="Feature")
str(importance)

## best predictors
for (v in 0:1){
  cat("\n\n")
  cat("value==", v)
  importance <- importance[order(importance$MeanDecreaseGini, decreasing=TRUE),]
  print(head(importance[importance$sign==v,], n=10))
  cat("\n")
  cat(paste(unique(head(importance$Feature[importance$sign==v], n=10)), collapse=", "))
}

# let's draw a graph with our results!
# first of all, let's recode the MeanDecreaseGini of the negative features as negative values
importance$Gini <- ifelse(importance$sign>0, importance$MeanDecreaseGini, importance$MeanDecreaseGini*-1)
# the top 20 positive words
importance2 <- top_n(importance, 20, Gini)
importance2
# the top 20 negative words
importance3 <- top_n(importance, -20, Gini)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# reorder the features
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gini))
importance_new$sign2 <- ifelse(importance_new$sign>0, "positive", "negative")
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gini))

ggplot(importance_new, aes(Feature, Gini, fill = sign2)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Mean Decrease Gini (values for the Negative features recoded as negative)") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Movie Reviews", subtitle = "Negative (-) versus Positive (+) features - RF")

#### and with 3 categories? Try to find that out by yourself!

# let's FINALLY predict the test-set
system.time(predicted_rf <- predict(RF, test, type="class"))
table(predicted_rf)
prop.table(table(predicted_rf))
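# As mentioned at the start of the RF section, ranger is a (usually faster) alternative implementation that also
# accepts the compressed dgCMatrix created in the fourth step. A minimal sketch, assuming a reasonably recent
# ranger version with the x/y interface ("RF_ranger" is an illustrative name; the settings are not tuned):
library(ranger)
set.seed(123)
RF_ranger <- ranger(x = trainSP, y = as.factor(Dfm_train@docvars$Sentiment),
                    num.trees = 500, importance = "impurity")
RF_ranger$prediction.error                                     # OOB error, comparable to randomForest's estimate
head(sort(RF_ranger$variable.importance, decreasing = TRUE))   # impurity-based (Gini) variable importance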
#####################################################
# let's run a SVM
#####################################################

# note that here I select a linear kernel (in my experience a linear kernel does fine with text data)
# and, as a hyperparameter, a specific value for the cost C (=1). A SVM also has other hyperparameters.
# More on this later on
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$Sentiment), x = train, kernel = 'linear', cost = 1))

# how many support vectors?
length(SV$index)
nrow(train)  # 352 out of 500 texts in the train data frame
head(SV$coefs)
# The coefficients you are pulling out are the weights for the support vectors.
# Looking at the estimated coefficients alone is not very informative, because they only tell us which support vectors
# were estimated in the model. But we can get a sense of which observations are more "important", i.e. "separate" the
# data better, by extracting the support vectors from the data matrix together with their corresponding coefficients
# (times the training labels):

# let's predict the training-set and store the predictions as a new variable in the docvars of the training-set dfm
predicted_sv <- predict(SV, train, type="class")
table(predicted_sv)
Dfm_train@docvars$predSV <- ifelse(predicted_sv=="neg", 0, 1)
# Once again, perfect prediction of the training-set!
table(Dfm_train@docvars$Sentiment, Dfm_train@docvars$predSV)

# let's identify these support vectors (and their corresponding texts!)
str(x)
df <- data.frame(
  vector = x$text[SV$index],
  coef = SV$coefs,
  sentiment = predicted_sv[SV$index],
  stringsAsFactors = FALSE
)
str(df)
str(SV)
# take a look at "decision.values": you can read "neg/pos".
# A positive coefficient means in this case a "negative" review (the numerator) and vice versa
# (a small sketch extracting the decision values follows below)

# negative reviews (positive values)
df <- df[order(df$coef, decreasing=TRUE),]
head(df[,c("coef", "sentiment", "vector")], n=10)
# positive reviews (negative values)
df <- df[order(df$coef),]
head(df[,c("coef", "sentiment", "vector")], n=10)
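# The "neg/pos" decision values mentioned above can also be inspected directly (a minimal sketch;
# "pred_dv" is just an illustrative object name). For a two-class problem, e1071 returns one decision value per
# text; its sign should track the class order shown in the "neg/pos" column name.
pred_dv <- predict(SV, train, decision.values = TRUE)
head(attr(pred_dv, "decision.values"))                        # positive values should lean towards "neg", negative towards "pos"
table(sign(attr(pred_dv, "decision.values")), predicted_sv)   # the sign lines up with the predicted labels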
# compute the feature importance matrix
str(SV$coefs)  # coefficients for the support vectors
str(SV$SV)     # the scaled coordinates of the support vectors (for each feature)
dtb <- as.data.frame(SV$SV)
str(dtb)
W <- t(SV$coefs) %*% SV$SV  # let's multiply the two sets of coefficients
str(W)
W <- t(W)
str(W)
# The weights estimated this way indicate which features best predict the source (in our case the type of movie
# review, either positive or negative), i.e. the features that are most "telling" of the type of review.
W <- as.data.frame(W)
W$Feature <- row.names(W)
names(W)[1] <- "weights"
str(W)
# let's recode the weights so that features associated with the Positive class take positive values and vice versa for the Negative class
W$weights <- W$weights*-1
W$sign <- ifelse(W$weights>0, "positive", "negative")
print(head(W[order(W$weights, decreasing=TRUE),], 20))   # positive words
print(head(W[order(W$weights, decreasing=FALSE),], 20))  # negative words

# let's draw a graph!
W2 <- top_n(W, 20, weights)
str(W2)
W3 <- top_n(W, -20, weights)
str(W3)
W3
W_new <- rbind(W2, W3)
str(W_new)
# reorder the features
W_new <- mutate(W_new, Feature = reorder(Feature, weights))

ggplot(W_new, aes(Feature, weights, fill = sign)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("weights") +
  scale_fill_manual(values = c("orange", "blue")) +
  labs(title = "Movie Reviews", subtitle = "Negative (-) versus Positive (+) words - SVM")

# let's FINALLY predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))

######################################################
######################################################
# Let's compare the out-of-sample results we got via Naive Bayes, SVM & RF
######################################################
######################################################

prop.table(table(predicted_nb))
prop.table(table(predicted_svm))
prop.table(table(predicted_rf))

results <- as.data.frame(rbind(prop.table(table(predicted_nb)),
                               prop.table(table(predicted_rf)),
                               prop.table(table(predicted_svm))))
str(results)
results$algorithm <- c("NB", "RF", "SVM")
str(results)

# Let's plot the results!
df.long <- melt(results, id.vars=c("algorithm"))
str(df.long)

ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_bar(position="dodge", stat="identity") +
  theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) +
  coord_flip() +
  ylab(label="Sentiment class in the test-set") +
  xlab("algorithm")

# And so, which result should we trust more?
# In our case we have the "true" sentiment values for the test-set, so we can actually tell!
# This is an exception, of course: the test-set is BY DEFINITION unlabeled!
# How can we compare the true values with the predicted ones?
# Let's estimate the MAE - mean absolute error (just to get a rough idea of the algorithms' performance).
# HOWEVER, remember: this is NOT the most appropriate way to measure (and compare) the performance of ML algorithms
# that estimate the individual classification of texts. A much better alternative (as we will discuss next week) is
# computing a Confusion matrix and the accuracy/F1 statistics (more on that next week!)
prop.table(table(x10$Sentiment))
mae <- as.data.frame(rbind(prop.table(table(predicted_nb)),
                           prop.table(table(predicted_rf)),
                           prop.table(table(predicted_svm)),
                           prop.table(table(x10$Sentiment))))
str(mae)
mae$algorithm <- c("NB", "RF", "SVM", "TRUE")
mae
mae_NB <- (abs(mae[1,1]-mae[4,1]) + abs(mae[1,2]-mae[4,2]))/2
mae_RF <- (abs(mae[2,1]-mae[4,1]) + abs(mae[2,2]-mae[4,2]))/2
mae_SV <- (abs(mae[3,1]-mae[4,1]) + abs(mae[3,2]-mae[4,2]))/2
# NB appears to do better, followed by SVM and RF
mae_NB
mae_RF
mae_SV
# We will come back to this point in the next class, when we discuss Cross-Validation and ML!
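# A minimal preview of next week's topic (a sketch, assuming the test labels in x10$Sentiment use the same
# "neg"/"pos" coding as the training-set): the share of test reviews classified correctly by each algorithm.
mean(as.character(predicted_nb)  == x10$Sentiment)   # NB accuracy
mean(as.character(predicted_rf)  == x10$Sentiment)   # RF accuracy
mean(as.character(predicted_svm) == x10$Sentiment)   # SVM accuracy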