# Start from an empty workspace: removes every object (hidden ones included)
# from the global environment.
# NOTE(review): this wipes whatever was loaded in the session - run the script
# in a fresh R session only.
rm(list=ls(all=TRUE))
# show the working directory before changing it
getwd()
# Point R at the folder that holds the CSV files read below with relative paths.
# The path is machine-specific: edit it to match your own computer.
# setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
# confirm the new working directory
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors = FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTwitterTrain <- corpus(x)
# Tokenise first, then build the DfM. Calling dfm() directly on a corpus with
# tokenisation/removal arguments is deprecated since quanteda 3 and no longer
# supported in quanteda 4; the steps below are the equivalent supported form.
toks_train <- tokens(
  myCorpusTwitterTrain,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE,
  split_hyphens = TRUE
)
# drop the English stopwords (pattern matching is case-insensitive by default)
toks_train <- tokens_remove(toks_train, stopwords("english"))
Dfm_train <- dfm(toks_train, tolower = TRUE)
# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 0.05, docfreq_type = "prop", verbose = TRUE)
topfeatures(Dfm_train, 20) # 20 top words
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_review2.csv", stringsAsFactors = FALSE)
# NOTE that in this case we also have the "true" value for sentiment in the test-set. Of course usually this won't happen!
# It is a test-set after all!
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
# Tokenise first, then build the DfM. Calling dfm() directly on a corpus with
# tokenisation/removal arguments is deprecated since quanteda 3 and no longer
# supported in quanteda 4; the steps below are the equivalent supported form.
toks_test <- tokens(
  myCorpusTwitterTest,
  remove_punct = TRUE,
  remove_numbers = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE,
  split_hyphens = TRUE
)
# drop the English stopwords (pattern matching is case-insensitive by default)
toks_test <- tokens_remove(toks_test, stopwords("english"))
Dfm_test <- dfm(toks_test, tolower = TRUE)
# keep only tokens that appear in at least 5% of the test reviews
# NOTE(review): this trim uses the test-set's own document frequencies, so a
# feature that is rare here is dropped and, if it belongs to the training
# vocabulary, it is later re-created as an all-zero column when the features
# are matched. Consider skipping this trim and relying on the matching alone.
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 0.05, docfreq_type = "prop", verbose = TRUE)
topfeatures(Dfm_test, 20) # 20 top words
#####################################################
# THIRD STEP: align the vocabulary of the test-set with that of the training-set
# by handing the training features to dfm_match().
# Afterwards we can "predict" using only the features seen in the training-set.
# As a side effect the test DfM gets smaller whenever the test-set is huge
# compared to the training-set (not the case here, of course!)
#####################################################
train_vocab <- featnames(Dfm_train)
# do the two DfMs already share the same features? and how many does each have?
setequal(train_vocab, featnames(Dfm_test))
nfeat(Dfm_test)
nfeat(Dfm_train)
# keep the training features only (features missing in the test-set are zero-padded)
test_dfm <- dfm_match(Dfm_test, features = train_vocab)
# same checks after the matching
nfeat(test_dfm)
setequal(train_vocab, featnames(test_dfm))
#####################################################
# FOURTH STEP: turn the two DfMs into matrices, the input format the ML algorithms expect
#####################################################
# dense version of the training DfM and its memory footprint
train <- as.matrix(Dfm_train)
dense_size <- object.size(train)
dense_size
# compressed (column-oriented) sparse version and its memory footprint
trainSP <- as(Dfm_train, "dgCMatrix")
sparse_size <- object.size(trainSP)
sparse_size
# how many times bigger is the dense matrix?
dense_size / sparse_size
# With large training/test matrices this gap becomes HUGE!
# In class we mostly stick to dense matrices because our datasets are small,
# but keep this in mind!
test <- as.matrix(test_dfm)
#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################
#####################################################
# Let's start with a Naive Bayes Model
#####################################################
# we will use the naivebayes package. Another possible package you can consider: the fastNaiveBayes package
# given our training-set, we have to use a Multinomial rather than a Binomial distribution given that our
# features can assume a value different than just 0/1, i.e., a one-hot-encoding. And in fact
# (these are the non-zero cell counts stored in the sparse DfM):
table(Dfm_train@x)
# to run a Binomial model with naivebayes just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the below command
# our DV, read through the public docvars() accessor rather than the internal @docvars slot
str(docvars(Dfm_train, "Sentiment")) # that's a character variable! Not good! So we will use as.factor to transform it into a factor variable
nb_labels <- as.factor(docvars(Dfm_train, "Sentiment"))
set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(NB22 <- multinomial_naive_bayes(x = train, y = nb_labels, laplace = 1))
summary(NB22)
prop.table(table(docvars(Dfm_train, "Sentiment"))) # prior probabilities
# How are words tied to each class? NB22$params is the matrix of
# class-conditional parameter estimates (one row per word, one column per class).
# Compare "family" and "comedy": the former raises the probability of a positive review more than the latter
head(NB22$params)
# dig a bit deeper: move the estimates into a data frame, one row per feature
NB_prob <- as.data.frame(NB22$params)
NB_prob[["Feature"]] <- rownames(NB_prob)
str(NB_prob)
# gap between the positive and the negative conditional probability of each feature
NB_prob[["diff"]] <- with(NB_prob, pos - neg)
str(NB_prob)
# largest gaps first: the most positive words
rank_desc <- order(NB_prob$diff, decreasing = TRUE)
print(head(NB_prob[rank_desc, ], 15))
# smallest gaps first: the most negative words
rank_asc <- order(NB_prob$diff, decreasing = FALSE)
print(head(NB_prob[rank_asc, ], 15))
# label every feature by the direction of its gap
NB_prob[["sign"]] <- ifelse(NB_prob$diff > 0, "positive", "negative")
str(NB_prob)
# the 20 most positive and the 20 most negative contributing features
NB_prob2 <- top_n(NB_prob, 20, diff)
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff)
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# order the features by their gap so that the bars come out sorted
NB_prob_new$Feature <- reorder(NB_prob_new$Feature, NB_prob_new$diff)
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("orange", "blue")) +
  coord_flip() +
  labs(
    title = "Movie Reviews",
    subtitle = "Negative (-) versus Positive (+) words - NB",
    y = "Difference in the conditional probabilities"
  )
# FINALLY: classify the test-set with the Naive Bayes model
predicted_nb <- predict(NB22, newdata = test)
# predicted classes: counts first, then shares
nb_counts <- table(predicted_nb)
nb_counts
prop.table(nb_counts)
#####################################################
# let's run a Random Forest
#####################################################
# we will use the randomForest package. If you are employing compressed matrices, plz use the ranger package.
# the ranger package is also usually faster than the randomForest one
# note that as hyperparameter (or tuning-parameter), I select a specific value for the number of trees. A RF has also other hyperparameters.
# More on this later on
# our DV as a factor, read through the public docvars() accessor rather than the internal @docvars slot
rf_labels <- as.factor(docvars(Dfm_train, "Sentiment"))
set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(RF <- randomForest(x = train, y = rf_labels, importance = TRUE, do.trace = TRUE, ntree = 500))
RF
# why 32 for the no. of variables randomly tried at each split? That's another hyperparameter of a RF
# The default is mtry=sqrt(p), where p is number of variables in x. In our case p=nfeat(Dfm_train)=1058!
# therefore sqrt(nfeat(Dfm_train))=32.5
# A natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides an efficient and reasonable approximation of the test error. This provides a built-in validation set without
# any extra work on your part, and you do not need to sacrifice any of your training data to use for validation.
# This makes identifying the number of trees required to stabilize the error rate during tuning more efficient;
# however, REMEMBER: this is less efficient than changing several hyperparameters and doing k-fold as we will see later on!
RF
# why OOB=20.6%? This estimate is calculated by counting however many points in the training set were misclassified
# (67 negative and 36 positive = 103) and dividing this number by the total number of observations (103/500 = 20.6%)
# We will discuss at great length about the Confusion matrix next week!
plot(RF) # shows: the errors for the two classes (red=negative; green=positive) and in black the average
str(RF$err.rate)
error <- as.data.frame(RF$err.rate)
str(error)
# number of trees with lowest OOB error. We store it instead of typing it by hand,
# so that the second forest stays in sync with the first one if the data change
best_ntree <- which.min(error$OOB)
best_ntree # 432 in the original run
set.seed(123)
system.time(RF2 <- randomForest(x = train, y = rf_labels, importance = TRUE, ntree = best_ntree, do.trace = TRUE))
RF2
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable provides
# to the model for predicting the outcome
# We pick the two overall measures BY NAME: their position in RF$importance depends on the
# number of classes (one extra column per class comes first), so [,3:4] only works with 2 classes
imp_cols <- c("MeanDecreaseAccuracy", "MeanDecreaseGini")
head(RF$importance[, imp_cols])
# let's graph the result
varImpPlot(RF)
# Each feature's importance is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
# Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
# but would make the second variable more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
# Highest purity means that each node contains only elements of a single class.
# Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
# Plz note that these measures are used to rank variables in terms of importance and, thus, their absolute values could be disregarded.
# The problem is that we just get for example the GINI statistics overall w/o differentiating the words most important for specific classes.
# which are however the most important words for the positive label? and for the negative one?
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[, imp_cols])
str(importance_RF)
importance_RF$Feature <- row.names(importance_RF)
str(importance_RF)
# same words we get with
varImpPlot(RF)
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing = TRUE), ]))
# let's predict the training-set and let's store a new variable in our dfm of the training-set with such predictions
predicted_rf <- predict(RF, train, type = "class")
table(predicted_rf)
# 0 = predicted negative, 1 = predicted positive
pred_rf01 <- ifelse(predicted_rf == "neg", 0, 1)
docvars(Dfm_train, "predRF") <- pred_rf01
# NOTE: perfect prediction of the training-set!
table(docvars(Dfm_train, "Sentiment"), docvars(Dfm_train, "predRF"))
# adding sign [if 0/1 according to the content of the review - neg or pos]
# Total frequency of every word among the reviews predicted negative / positive.
# drop = FALSE keeps a matrix even when a class holds a single review.
word_counts <- cbind(
  neg = colSums(train[pred_rf01 == 0, , drop = FALSE]),
  pos = colSums(train[pred_rf01 == 1, , drop = FALSE])
)
# let's assign a word to pos/neg according to its frequency (if for example the word "bad" appears
# 10 times among predicted negative reviews and 5 times among predicted positive reviews,
# we classify it as pertaining to the "negative reviews" world; and so on).
# A tie goes to the negative class.
feature_sign <- data.frame(
  Feature = colnames(train),
  sign = as.numeric(word_counts[, "pos"] > word_counts[, "neg"]),
  stringsAsFactors = FALSE
)
str(feature_sign)
importance <- merge(importance_RF, feature_sign, by = "Feature")
str(importance)
## best predictors
# sort once, most important feature (by Gini) first
importance <- importance[order(importance$MeanDecreaseGini, decreasing = TRUE), ]
for (v in 0:1) {
  cat("\n\n")
  cat("value==", v)
  top_v <- head(importance[importance$sign == v, ], n = 10)
  print(top_v)
  cat("\n")
  # the column is called "Feature": the misspelt "Features" returned NULL and printed nothing
  cat(paste(unique(top_v$Feature), collapse = ", "))
}
# let's draw a graph with our results!
# first of all let's recode as negative values the MeanDecreaseGini for the negative features
importance$Gini <- ifelse(importance$sign > 0, importance$MeanDecreaseGini, -importance$MeanDecreaseGini)
# the top 20 positive words
importance2 <- top_n(importance, 20, Gini)
importance2
# the top 20 negative words
importance3 <- top_n(importance, -20, Gini)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# readable label for the fill colour
importance_new$sign2 <- ifelse(importance_new$sign > 0, "positive", "negative")
# reorder the features (once is enough) so that the bars come out sorted
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gini))
ggplot(importance_new, aes(Feature, Gini, fill = sign2)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Mean Decrease Gini (we recoded as negative the values for the Negative features)") +
  # named values: each colour stays with its class even if one class is missing
  scale_fill_manual(values = c(negative = "orange", positive = "blue")) +
  labs(title = "Movie Reviews",
       subtitle = "Negative (-) versus Positive (+) features - RF")
#### and with 3 categories? try to find it out by yourself!
# FINALLY: classify the test-set with the Random Forest (and time it)
system.time(predicted_rf <- predict(RF, newdata = test, type = "class"))
# predicted classes: counts first, then shares
rf_counts <- table(predicted_rf)
rf_counts
prop.table(rf_counts)
#####################################################
# let's run a SVM
#####################################################
# note that here I select a linear kernel (in my experience a linear kernel is doing fine with texts data)
# and, as hyperparameter, a specific value for the cost C(=1)- A SVM has also other hyperparameters. More on this later on
# our DV as a factor, read through the public docvars() accessor rather than the internal @docvars slot
sv_labels <- as.factor(docvars(Dfm_train, "Sentiment"))
set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(SV <- svm(x = train, y = sv_labels, kernel = "linear", cost = 1))
# how many support vectors?
length(SV$index)
nrow(train) # 352 out of 500 texts in the train data frame
head(SV$coefs)
# The coefficients that you're pulling out are the weights for the support vectors.
# Looking at the estimated coefficients is not as informative because they only tell us what support vectors were estimated in the model.
# But we can have a sense of what observations are more "important" or "separate" better the data by extracting the support vectors
# in the data matrix and then their corresponding coefficients (times the training labels).
# let's predict the training-set and let's store a new variable in our dfm of the training-set with such predictions
# (predict() for an svm object has no "type" argument: it returns the predicted class by default)
predicted_sv <- predict(SV, train)
table(predicted_sv)
docvars(Dfm_train, "predSV") <- ifelse(predicted_sv == "neg", 0, 1)
# Once again perfect prediction of the training-set!
table(docvars(Dfm_train, "Sentiment"), docvars(Dfm_train, "predSV"))
# let's identify the support vectors (and the texts they correspond to!)
str(x)
sv_rows <- SV$index
df <- data.frame(
  vector = x$text[sv_rows],
  coef = SV$coefs,
  sentiment = predicted_sv[sv_rows],
  stringsAsFactors = FALSE
)
str(df)
str(SV)
# have a look at "decision.values" in the output above: it reads "neg/pos".
# So here a positive coefficient points to a "negative" review (the numerator), and vice versa
shown_cols <- c("coef", "sentiment", "vector")
# negative reviews (largest positive coefficients first)
df <- df[order(df$coef, decreasing = TRUE), ]
head(df[, shown_cols], n = 10)
# positive reviews (most negative coefficients first)
df <- df[order(df$coef), ]
head(df[, shown_cols], n = 10)
# build the feature-importance (weight) matrix of the linear SVM
str(SV$coefs) # one coefficient per support vector
str(SV$SV) # scaled coordinates of the support vectors (one column per feature)
dtb <- as.data.frame(SV$SV)
str(dtb)
# coefficients times coordinates: a single row with one weight per feature
W <- t(SV$coefs) %*% SV$SV
str(W)
# flip it: one row per feature
W <- t(W)
str(W)
# These weights tell us which features best predict the source (here the type of movie
# review: positive or negative), i.e. the features that are most "telling" of the review type.
W <- as.data.frame(W)
W[["Feature"]] <- rownames(W)
colnames(W)[1] <- "weights"
str(W)
# switch the sign so that features of the Positive class get positive values and those of the Negative class negative ones
W$weights <- -W$weights
W[["sign"]] <- ifelse(W$weights > 0, "positive", "negative")
print(head(W[order(W$weights, decreasing = TRUE), ], 20)) # positive words
print(head(W[order(W$weights, decreasing = FALSE), ], 20)) # negative words
# and now the graph: the 20 largest and the 20 smallest weights
W2 <- top_n(W, 20, weights)
str(W2)
W3 <- top_n(W, -20, weights)
str(W3)
W3
W_new <- rbind(W2, W3)
str(W_new)
# order the features by weight so that the bars come out sorted
W_new$Feature <- reorder(W_new$Feature, W_new$weights)
ggplot(W_new, aes(Feature, weights, fill = sign)) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values = c("orange", "blue")) +
  coord_flip() +
  labs(
    title = "Movie Reviews",
    subtitle = "Negative (-) versus Positive (+) words - SVM",
    y = "weights"
  )
# FINALLY: classify the test-set with the SVM (and time it)
system.time(predicted_svm <- predict(SV, newdata = test))
# predicted classes: counts first, then shares
svm_counts <- table(predicted_svm)
svm_counts
prop.table(svm_counts)
######################################################
######################################################
# Out-of-sample comparison of the Naive Bayes, SVM & RF predictions
######################################################
######################################################
# share of each predicted class, one table per algorithm
share_nb <- prop.table(table(predicted_nb))
share_svm <- prop.table(table(predicted_svm))
share_rf <- prop.table(table(predicted_rf))
share_nb
share_svm
share_rf
# stack the shares, one row per algorithm (deparse.level = 0: no row names taken from the object names)
results <- as.data.frame(rbind(share_nb, share_rf, share_svm, deparse.level = 0))
str(results)
results$algorithm <- c("NB", "RF", "SVM")
str(results)
# Plot time: long format first (one row per algorithm/class pair)
df.long <- melt(results, id.vars = "algorithm")
str(df.long)
ggplot(df.long, aes(algorithm, value, fill = variable)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(color = "#993333", size = 10, angle = 90)) +
  coord_flip() +
  labs(x = "algorithm", y = "Sentiment class in the test-set")
# and so, which result is to trust more than the other one?
# In our case we have the "true" reviews value! So we can say that! However, this is just an exception of course!
# The test-set is BY DEFINITION always unlabeled!
# how can we compare the true value with the predicted ones?
# let's estimate the MAE - mean absolute error (just to have a rough idea of the algorithms' performance).
# HOWEVER remember: this is NOT the MOST appropriate way to measure (and compare) the performance of ML algorithms that estimate individual classification of texts.
# A much better alternative (as we will discuss next week) is computing a Confusion matrix, and the accuracy/F1 statistics (more on that next week!)
prop.table(table(x10$Sentiment))
# one row per algorithm plus a last row with the true class shares
mae <- as.data.frame(rbind(prop.table(table(predicted_nb)), prop.table(table(predicted_rf)),
                           prop.table(table(predicted_svm)), prop.table(table(x10$Sentiment))))
str(mae)
mae$algorithm <- c("NB", "RF", "SVM", "TRUE")
mae
# Mean absolute difference between the class shares predicted by `algorithm`
# and the true class shares (row `truth`) in `shares`.
# Every column except "algorithm" is treated as a class share, so this works
# with any number of classes.
share_mae <- function(shares, algorithm, truth = "TRUE") {
  class_cols <- setdiff(names(shares), "algorithm")
  predicted <- unlist(shares[shares$algorithm == algorithm, class_cols])
  observed <- unlist(shares[shares$algorithm == truth, class_cols])
  mean(abs(predicted - observed))
}
mae_NB <- share_mae(mae, "NB")
mae_RF <- share_mae(mae, "RF")
mae_SV <- share_mae(mae, "SVM")
# NB appears to do better, followed by SVM and RF
mae_NB
mae_RF
mae_SV
# We will be back to this point in the next class when we will discuss about Cross-Validation and ML!