rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(caret)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(glmnet)

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################

# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))

myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers=TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_train <- dfm(tok2)

# Let's trim the DfM in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, verbose=TRUE, docfreq_type = "prop")
topfeatures(Dfm_train, 20)  # 20 top words

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################

x10 <- read.csv("test_review3.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)

myCorpusTwitterTest <- corpus(x10)
tok2 <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers=TRUE,
               remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem(tok2)
Dfm_test <- dfm(tok2)
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 0.05, verbose=TRUE, docfreq_type = "prop")
topfeatures(Dfm_test, 20)  # 20 top words

#####################################################
# THIRD STEP: let's make the features identical between train and test-set by passing
# Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" by employing only the features included in the training-set!
# This moreover makes the DfM for the test-set smaller if you have a giant test-set compared
# to the training-set (which, of course, is not the case here!)
#####################################################

setequal(featnames(Dfm_train), featnames(Dfm_test))
length(Dfm_test@Dimnames$features)
length(Dfm_train@Dimnames$features)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)
setequal(featnames(Dfm_train), featnames(test_dfm))

#####################################################
# FOURTH STEP: let's convert the DfMs into matrices for the ML algorithms to work
#####################################################

train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)

#####################################################
# FIFTH STEP: let's estimate a RR Model
#####################################################

#####################################################
# Let's start with a Regularized Regression - RIDGE model
#####################################################

# Ridge regression requires that you specify "alpha=0".
# As a family here you specify "binomial" because your outcome variable has 2 categories
# (negative, positive); if it had 3 or more categories, you would HAVE to use the family "multinomial".
# Moreover, if you run a "multinomial" you also have to select type.multinomial, either "grouped" or
# "ungrouped" (i.e., type.multinomial=c("ungrouped")). The main difference is that with "ungrouped" you
# estimate a lasso penalty for each feature according to the class label (for example, a different lasso
# penalty for the word "fair" when you are dealing with the positive, negative or neutral class label,
# respectively). With type.multinomial=c("grouped") you have one single grouped-lasso penalty for each
# feature, irrespective of the class label. As a result, type.multinomial can be considered a tuning
# parameter in a multinomial RR.
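# A purely hypothetical sketch of that multinomial case (NOT run here, since our Sentiment variable is
# binary): "y_three_classes" below is an assumed 3-category outcome vector that does not exist in this
# script; the point is only to show where family and type.multinomial would enter the call.
# multi_cv <- cv.glmnet(y = as.factor(y_three_classes),   # hypothetical 3-class outcome vector
#                       x = train, family = "multinomial",
#                       alpha = 1,                        # type.multinomial matters with a lasso penalty
#                       type.multinomial = "grouped",     # or "ungrouped": treat it as a tuning parameter
#                       nfolds = 10, type.measure = "class")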
# To identify the value of lambda, glmnet by default estimates 100 different lambdas
# (but you can control the number via the argument nlambda=100)
system.time(ROLS <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train,
                           family="binomial", alpha=0, trace.it=1))
str(ROLS)

# The number at the top of the plot (1139) just refers to the number of variables (features in our case)
# in the model. Ridge regression does not force any variable to exactly zero, so all features remain in
# the model. As lambda increases, the penalty becomes larger and forces the coefficients closer
# (but NOT exactly equal) to zero
plot(ROLS, xvar = "lambda")

# let's predict the test-set
system.time(predicted_rr <- predict(ROLS, test, type="class"))
# Why do you get 100,000 predictions? Because there are 1,000 texts in the test-set * 100 lambdas
# estimated by default!
table(predicted_rr)

# So it is always better to use cv.glmnet (i.e., running a CV on glmnet) and extract the optimal
# lambda (lambda.min)
set.seed(123)
system.time(ridge <- cv.glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial",
                               alpha=0, nfolds=10, type.measure="class", trace.it=1))
plot(ridge)

# The plot illustrates the 10-fold CV misclassification error across the lambda values.
# As we constrain our coefficients more heavily (log of lambda > 2), the misclassification error rises
# considerably. The first and second vertical dashed lines represent the lambda value with the minimum
# misclassification error and the largest lambda value within one standard error of that minimum,
# respectively
min(ridge$cvm)          # minimum misclassification error (i.e., 1-accuracy)
ridge$lambda.min        # lambda for this minimum
log(ridge$lambda.min)
ridge$cvm[ridge$lambda == ridge$lambda.1se]  # misclassification error within 1 std. error of the minimum
ridge$lambda.1se        # lambda for this error
log(ridge$lambda.1se)
plot(ridge)

lse <- as.numeric(ridge$lambda.1se)  # let's save the lambda in this latter case
lse
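# A small optional summary (a sketch using only the quantities computed above): put the two candidate
# lambdas and their CV misclassification errors side by side, to see how much error we trade for the
# more constrained lambda.1se solution.
ridge_cv_summary <- data.frame(
  rule       = c("lambda.min", "lambda.1se"),
  lambda     = c(ridge$lambda.min, ridge$lambda.1se),
  log_lambda = log(c(ridge$lambda.min, ridge$lambda.1se)),
  cv_error   = c(min(ridge$cvm), ridge$cvm[ridge$lambda == ridge$lambda.1se])
)
ridge_cv_summary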
# Once you have identified your preferred lambda, you can simply use the command "predict" to predict
# the same model on a new data set. The only caveat is that you need to supply predict with an "s"
# parameter set to the preferred model's lambda value.
# Let's go back to our previous ROLS object
plot(ROLS, xvar = "lambda")
abline(v = log(ridge$lambda.min), col = "red", lty = "dashed")
abline(v = log(ridge$lambda.1se), col = "blue", lty = "dashed")

# these are the results when you use lambda.min
system.time(predicted_glment <- predict(ROLS, test, s = ridge$lambda.min, type="class"))
table(predicted_glment)
prop.table(table(predicted_glment))

# same results you get if you predict with the CV object for ridge above
system.time(predicted_rr2 <- predict(ridge, test, s = ridge$lambda.min, type="class"))
table(predicted_rr2)
prop.table(table(predicted_rr2))

# these are the results when you use lambda.1se
system.time(predicted_glment2 <- predict(ROLS, test, s = lse, type="class"))
table(predicted_glment2)
prop.table(table(predicted_glment2))

# Why should you ever want to use lambda.1se rather than lambda.min?
# The advantage of identifying the lambda with a misclassification error within one standard error of
# the minimum becomes more obvious with the lasso and elastic net models (see below!)

# Let's do some Global Interpretation for our RR!
newROLS <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial",
                  alpha=0, trace.it=1, lambda=ridge$lambda.min)
str(newROLS)

# Here we have just two classes in our DV: therefore, a positive coefficient means that the feature
# increases the probability of sentiment=pos; the opposite holds for a negative coefficient.
# let's create a dataframe with the features and their coefficients
coeff_df <- as.data.frame(Dfm_train@Dimnames$features)
str(coeff_df)
names(coeff_df)[1] <- "Feature"
coeff_df$coeff <- newROLS$beta@x  # with a ridge penalty no coefficient is shrunk exactly to zero, so beta@x has one entry per feature
str(coeff_df)

# top 20 positive coefficients
top_n(coeff_df, 20, coeff)
importance2RR <- top_n(coeff_df, 20, coeff)
# top 20 negative coefficients
top_n(coeff_df, -20, coeff)
importance3RR <- top_n(coeff_df, -20, coeff)
importance_newRR <- rbind(importance2RR, importance3RR)
str(importance_newRR)

# reorder the features
importance_newRR <- mutate(importance_newRR, Feature = reorder(Feature, coeff))
importance_newRR$sign3 <- ifelse(importance_newRR$coeff > 0, "Positive", "Negative")

ggplot(importance_newRR, aes(Feature, coeff, fill = sign3)) +
  geom_bar(stat="identity", fill= "white") +
  theme_classic() +
  geom_col(show.legend = T) +
  theme(legend.position="bottom") +
  coord_flip() +
  ylab("Impact on classification (coefficients)") +
  scale_fill_manual(values = c("#000033", "#006699")) +
  labs(title = "Movie Reviews",
       subtitle = "Negative (-) versus Positive (+) words - RR Ridge Model",
       fill = "Movie Reviews")

######################################################
######################################################
# Let's now move to a LASSO regression model
######################################################
######################################################

# Lasso regression requires that you specify "alpha=1"
set.seed(123)
system.time(lasso <- cv.glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial",
                               alpha=1, nfolds=10, type.measure="class", trace.it=1))
plot(lasso)

min(lasso$cvm)    # minimum misclassification error of the lasso regression
min(ridge$cvm)    # minimum misclassification error of the ridge regression
lasso$lambda.min  # lambda for this value
log(lasso$lambda.min)
lasso$lambda.1se
log(lasso$lambda.1se)

# Now the advantage of identifying the lambda with a misclassification error within one standard error
# of the minimum becomes more obvious. If we use the lambda that drives the minimum misclassification
# error, we can reduce our feature set to fewer than 130 features. However, there will be some
# variability around this misclassification error, and we can reasonably assume that we could achieve a
# similar error with a slightly more constrained model that uses fewer than 50 features.
# If describing and interpreting the predictors is an important outcome of your analysis, this may
# significantly aid your endeavor. This is not necessarily the case while doing text analytics with
# lots of features... still, that's up to you as a decision.
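# A quick cross-check (sketch): cv.glmnet stores the number of non-zero coefficients at each lambda in
# its "nzero" element, so we can read off how many features survive at lambda.min versus lambda.1se
# without refitting anything. The exact counts will depend on your data and seed.
lasso$nzero[lasso$lambda == lasso$lambda.min]
lasso$nzero[lasso$lambda == lasso$lambda.1se]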
# Note that whereas the ridge regression approach pushes coefficients towards (but never exactly to)
# zero, the lasso penalty will actually push coefficients to zero
system.time(ROLS_lasso <- glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial",
                                 alpha=1, trace.it=1))
# as you can see, as lambda increases the penalty becomes larger and forces our coefficients to zero
plot(ROLS_lasso, xvar = "lambda")  # with lasso
plot(ROLS, xvar = "lambda")        # with ridge

######################################################
######################################################
# Let's now move to an ELASTIC NET regression model
######################################################
######################################################

# Any alpha value between 0 and 1 will perform an elastic net.
# When alpha = 0.5 we apply an equal combination of the two penalties, whereas an alpha closer to 0
# applies a heavier ridge penalty and an alpha closer to 1 a heavier lasso penalty.
# Ideally you could search for the optimal values of both lambda AND alpha. In the example below,
# however, we fix alpha = 0.5, so cv.glmnet will cross-validate over lambda only, not over alpha
# (a short sketch of an alpha grid search follows at the end of this section)
set.seed(123)
system.time(elastic <- cv.glmnet(y= as.factor(Dfm_train@docvars$Sentiment), x=train, family="binomial",
                                 alpha=0.5, nfolds=10, trace.it=1, type.measure="class"))
plot(elastic)
# note the difference (in the number of features) with...
plot(lasso)

min(elastic$cvm)    # minimum misclassification error of the elastic net regression model
elastic$lambda.min  # lambda for this value
log(elastic$lambda.min)
elastic$lambda.1se
log(elastic$lambda.1se)

# it seems that the ridge model is the one performing best in this case
min(ridge$cvm)    # minimum misclassification error of the ridge regression
min(lasso$cvm)    # minimum misclassification error of the lasso regression
min(elastic$cvm)  # minimum misclassification error of the elastic net regression model
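# A sketch of searching over alpha as well (this grid is an assumption of mine, not part of the original
# script): fix the fold assignment via foldid so every alpha value is cross-validated on identical folds,
# then compare the minimum CV misclassification errors across a small alpha grid.
set.seed(123)
foldid <- sample(rep(1:10, length.out = nrow(train)))
alpha_grid <- seq(0, 1, by = 0.25)
cv_errors <- sapply(alpha_grid, function(a) {
  fit <- cv.glmnet(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                   family = "binomial", alpha = a, foldid = foldid,
                   type.measure = "class")
  min(fit$cvm)  # minimum CV misclassification error for this alpha
})
data.frame(alpha = alpha_grid, min_cv_error = cv_errors)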
######################################################
######################################################
# Let's run a 5-fold CV with a Ridge Regression (to compare it with other ML algorithms, for example)
######################################################
######################################################

# STEP 1: create the 5 folds
library(cvTools)
ttrain <- train  # let's rename the original train matrix, given that we will reuse the name "train" inside the loop below

# let's split our training-set into 5 folds
set.seed(123)  # set the seed for replicability
k <- 5         # the number of folds; whatever number of folds you decide here, the procedure below will still work!
folds <- cvFolds(NROW(ttrain), K=k)
str(folds)

# STEP 2: run the loop across the 5 folds
system.time(for(i in 1:k){
  train <- ttrain[folds$subsets[folds$which != i], ]       # set the training set
  validation <- ttrain[folds$subsets[folds$which == i], ]  # set the validation set
  set.seed(123)
  # get your new model (fit on the training folds only) and ADD the name of the output (in this case "Sentiment")
  newrf <- cv.glmnet(y= as.factor(Dfm_train[folds$subsets[folds$which != i], ]@docvars$Sentiment),
                     x=train, family="binomial", alpha=0, nfolds=10, type.measure="class", trace.it=1)
  # get the predictions for the validation set (from the model just fit on the training folds)
  newpred <- predict(newrf, validation, s = newrf$lambda.min, type="class")
  class_table <- table("Predictions"= newpred,
                       "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$Sentiment)
  print(class_table)
  df <- confusionMatrix(class_table, mode = "everything")
  df.name <- paste0("conf.mat.rr", i)  # create the name of the object that will store the confusion matrix for each loop (=5)
  assign(df.name, df)
})

# STEP 3: the metrics
ls()  # we have created 5 objects that store the 5 confusion matrices; we can now estimate the performance metrics on such results
# for example:
conf.mat.rr1

RRPredict <- data.frame(col1=vector(), col2=vector(), col3=vector())
# a blank data frame with three columns to fill with the performance metrics;
# why 3 columns? 1 for accuracy, and 2 for the F1 values of the classes in Sentiment (given that you have just two classes!)

for(i in mget(ls(pattern = "conf.mat.rr"))) {
  Accuracy <- (i)$overall[1]  # save the accuracy value
  F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3])  # save the F1 value for the negative class
  F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4])  # save the F1 value for the positive class
  RRPredict <- rbind(RRPredict, cbind(Accuracy, F1_negative, F1_positive))
}
str(RRPredict)

# Let's compute the average value for accuracy and F1
acc_rr_avg <- mean(RRPredict[, 1])
f1_rr_avg <- mean(colMeans(RRPredict[-1]))
acc_rr_avg
f1_rr_avg
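# An optional cross-check (sketch): with mode = "everything", caret's confusionMatrix() already reports
# an F1 value for the class it treats as the reference (the first factor level, which is the negative
# class here, given how F1_negative is computed above), so for, e.g., the first fold it should match the
# manually computed F1 for that class.
conf.mat.rr1$byClass["F1"]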