rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(caret)
library(naivebayes)
library(car)
library(ggplot2)
library(dplyr)
library(reshape2)
library(gridExtra)
# The data we will be using are some English social media disaster tweets discussed in
# this article: https://arxiv.org/pdf/1705.02009.pdf
# It consist of a number of tweets regarding accidents mixed in with a selection control tweets (not about accidents)
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
x <- read.csv("train_disaster.csv", stringsAsFactors=FALSE)
str(x)
# class-label variable: choose_one (0=tweets not relevant for accidencts; 1=relevant tweets for accidents)
table(x$choose_one)
prop.table(table(x$choose_one))
nrow(x)
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE, remove_URL = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem (tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are short texts!)
# and let's keep only features with at least 2 characters
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
Dfm_train <- dfm_remove(Dfm_train , min_nchar = 2)
topfeatures(Dfm_train , 20) # 20 top words
# Always check if after trimming your dfm you have some texts with just 0s!
Dfm_train [ntoken(Dfm_train ) == 0,]
# Here problems with 6 texts! So let's delete such texts!
Dfm_train <- Dfm_train [ntoken(Dfm_train ) != 0,]
table(ntoken(Dfm_train ) == 0)
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_disaster.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok <- tokens(myCorpusTwitterTest , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE, remove_URL = TRUE)
tok <- tokens_remove(tok, stopwords("en"))
tok <- tokens_remove(tok, c("0*"))
tok <- tokens_wordstem (tok)
Dfm_test <- dfm(tok)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 2, verbose=TRUE)
Dfm_test<- dfm_remove(Dfm_test, min_nchar = 2)
# Here problems with 11 texts!
Dfm_test[ntoken(Dfm_test) == 0,]
# So let's delete such texts!
Dfm_test <- Dfm_test[ntoken(Dfm_test) != 0,]
Dfm_test[ntoken(Dfm_test) == 0,]
topfeatures(Dfm_test , 20) # 20 top words
#####################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
# After this step, we can "predict" the test-set by employing only the features included in the training-set.
# Avoiding to use dfm_match would generate a problem with RandomForest predictions on the test-set.
# That is, w/o dfm_match you would get the following error: Error in predict.randomForest - variables in the training data missing in newdata
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
nfeat(Dfm_test)
nfeat(Dfm_train)
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
nfeat(test_dfm)
setequal(featnames(Dfm_train), featnames(test_dfm ))
# of course if you have a unique dataset including both the training (i.e., the documents human-codified) and the test-set
# you could avoid this THIRD STEP, given that you would generate a unique Dfm with the same # of columns
# for all the documents irrespective of their status (i.e., being part of the training or the test-set).
# See below an example.
#####################################################
# FOURTH STEP: Let's convert the two DfMs into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train) # dense matrix
object.size(train)
# Alternatively, we could have saved the dfm as a compressed sparse matrix!
# What's the meaning of that? Take a look at the EXTRA slides on the home-page!
trainSP <- as(Dfm_train, "dgCMatrix") # compressed matrix
object.size(trainSP )
object.size(train)/object.size(trainSP )
str(train)
str(trainSP)
# The difference between a dense and a compressed matrix is going to make a HUGE difference when you have a large matrix as test and train!!!
# We will mainly use dense (i.e., not compressed) matrix in our class given that we will always use small dataset.
# But keeps this in mind!
test <- as.matrix(test_dfm)
#####################################################
# FIFHT STEP: let's estimate a ML Model
#####################################################
#####################################################
# Let's start with a Naive Bayes Model
#####################################################
# We will use the naivebayes package. Another possibile package you can consider is the fastNaiveBayes package
# Given our training-set, we have to use a Multinomial rather rather than a Binomial distribution given that our
# features can assume a value different than just 0/1, i.e., a one-hot-encoding. Indeed:
table(Dfm_train@x )
# to run a Binomial model with naivebayes just replace "multinomial_naive_bayes" with "bernoulli_naive_bayes" in the below command
# Which is our DV?
str(Dfm_train@docvars$choose_one) # that's an integer variable. Not good for a classification! It should always be a factor variable
# So we will use as.factor to transform it into a factor variable
system.time(NB <- multinomial_naive_bayes(x=train, y=as.factor(Dfm_train@docvars$choose_one), laplace = 1))
# Note that instead of y=as.factor(Dfm_train@docvars$choose_one) in the formula of the NB above we could have written
# y=as.factor(x$choose_one)! However I would always suggest you to extract the DV from the docvars of the DfM,
# especially if you have trimmed it and you have deleted some rows as we did
summary(NB)
prop.table(table(Dfm_train@docvars$choose_one)) # prior probabilities
# let's see the association between words and probabilities (i.e., matrix with class conditional parameter estimates - i.e., the likelihood!).
# take a look at "casualti" and "cream". The likelihood for the former is higher for a tweet discussing about a disaster (=1) compared
# to irrelavant tweets (=0), i.e., p(casualti|1)> p(casualti|0); the opposite happens for the word cream
head(NB$params)
# Let's investigate about this issue a bit more. Therefore, let's do some Global Interpretation of our NB!
NB_prob <- as.data.frame(NB$params)
NB_prob$Feature <- row.names(NB_prob)
str(NB_prob)
# let's estimate the features that change the most the difference between the relevant and irrelevant conditional probabilities
NB_prob$diff <- NB_prob[,2]-NB_prob[,1]
str(NB_prob)
# the features that present the highest absolute value in this difference can be also considered as the most important in
# affecting the overall performance of the algorithm
NB_prob$imp <- abs(NB_prob[,2]-NB_prob[,1])
str(NB_prob)
print(head(NB_prob[order(NB_prob$imp , decreasing=TRUE),], 15)) # most relevant words overall
# clearly, the features the present the highest positive values according to NB_prob$diff are the words
# most relevant for the "disaster" tweets (=1); those with the highest negative values are the words
# most relevant for the "irrelevant" tweets (=0)
print(head(NB_prob[order(NB_prob$diff , decreasing=TRUE),], 15)) # most relevant words for "disaster" tweets
print(head(NB_prob[order(NB_prob$diff , decreasing=FALSE),], 15)) # irrelevant words for the other tweets
NB_prob$sign <- ifelse(NB_prob$diff>0,"relevant","irrelevant")
str(NB_prob)
# let's extract the top 20-most irrelevant and the 20-most relevant contributing features
NB_prob2 <- top_n(NB_prob, 20, diff )
NB_prob2
NB_prob3 <- top_n(NB_prob, -20, diff )
NB_prob3
NB_prob_new <- rbind(NB_prob2, NB_prob3)
# reorder the features
NB_prob_new <- mutate(NB_prob_new, Feature= reorder(Feature, diff))
ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Difference in the likelihoods") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
fill = "Disaster")
# let's store the graph
NB_graph <- ggplot(NB_prob_new, aes(Feature, diff, fill = sign)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Difference in the likelihoods") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Naive Bayes Model",
fill = "Disaster")
# let's FINALLY predict the test-set
predicted_nb <- predict(NB ,test )
table(predicted_nb )
prop.table(table(predicted_nb ))
##########################
# ALTERNATIVE APPROACH
# let's create a unique dataset out of the training and the test-set and let's estimate on such dataset our ML algorithm
##########################
x$type <- "train"
str(x)
x10$type <- "test"
str(x10)
x_tot <- rbind(x, x10)
str(x_tot)
myCorpusTwitterTrain <- corpus(x_tot)
tok2 <- tokens(myCorpusTwitterTrain , remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE, remove_URL = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# let's also remove the unicode symbols
tok2 <- tokens_remove(tok2, c("0*"))
tok2 <- tokens_wordstem (tok2)
Dfm_tot <- dfm(tok2)
Dfm_tot <- dfm_trim(Dfm_tot , min_docfreq = 2, verbose=TRUE)
Dfm_tot <- dfm_remove(Dfm_tot , min_nchar = 2)
Dfm_tot [ntoken(Dfm_tot ) == 0,]
# So let's delete such texts!
Dfm_tot <- Dfm_test[ntoken(Dfm_tot ) != 0,]
head(docvars(Dfm_tot))
table(Dfm_tot@docvars$type)
# let's separate the texts included in our original training [in the "x" dataframe] form the test-set.
# number of texts in our training-set
train_index <- ndoc(Dfm_tot[Dfm_tot@docvars$type=="train",])
train_index
training <-c(1:train_index)
str(training)
testSet<-c((train_index+1):ndoc(Dfm_tot))
str(testSet)
db_tot <- as.matrix(Dfm_tot )
system.time(NB2 <- multinomial_naive_bayes(x=db_tot[training,], y=as.factor(Dfm_tot@docvars$choose_one[training]), laplace = 1))
predicted_nb2 <- predict(NB2 ,db_tot[testSet ,] )
table(predicted_nb2 )
round(prop.table(table(predicted_nb2 )),3)
# slightly different values than above. Why?
# It is related to the impact of dfm_trim done with a smaller dfm or a larger one (i.e., Dfm_train vs. Dfm_tot)
round(prop.table(table(predicted_nb )),3)
#####################################################
# let's run a Random Forest
#####################################################
# We will use the randomForest package. If you are employing compressed matrices, plz use the ranger package.
# the ranger package is also usually faster than the randomForest one
# Note that as hyperameter (or tuning-parameter), I select a specific value for the number of trees. A RF has also other hyperparameters.
# More on this next week!
set.seed(123) # (define a set.seed for being able to replicate the results!)
system.time(RF <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, do.trace=TRUE, ntree=500))
RF
# As discussed, a natural benefit of the bootstrap resampling process is that random forests have an out-of-bag (OOB) sample
# that provides a reasonable approximation of the test error. This provides a built-in validation set without any extra work on your part.
# However, REMEMBER: this is less efficient (especially with medium to large training-set) compared with
# doing a k-fold cross-validation as we will see in the next weeks!
str(RF$err.rate)
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes: irrelevant and relevant tweets)
plot(RF, type = "l", col = c("red", "black","springgreen4"), main = "Class errors in Random Forest Model / OOB")
legend("topright", horiz = F, cex = 0.7, fill = c( "black", "red", "springgreen4"),
c( "No-disaster tweets","Average error", "Disaster tweets"))
error <- as.data.frame(RF$err.rate)
# number of trees with the lowest OOB error
which.min(error$OOB) # 455
set.seed(123)
system.time(RF2 <- randomForest(y= as.factor(Dfm_train@docvars$choose_one), x=train, importance=TRUE, ntree=455, do.trace=TRUE))
RF$err.rate[nrow(RF$err.rate),] # our final error (overall, and for each of the two classes)
RF2$err.rate[nrow(RF2$err.rate),] # our final error (overall, and for each of the two classes)
# Now let's do some Global Interpretation for our RF!
# What about the importance of each feature for our trained model?
# Variable importance is a standard metric in machine learning, which indicates the amount of information a variable provides
# to the model for predicting the outcome
head(RF$importance[,3:4])
# let's grap the result
varImpPlot(RF )
# The importance of each featuress is assessed based on two criteria:
# -MeanDecreaseAccuracy: gives a rough estimate of the loss in prediction performance when that particular variable is omitted from the training set.
# Caveat: if two variables are somewhat redundant, then omitting one of them may not lead to massive gains in prediction performance,
# but would make the second variable more important.
# -MeanDecreaseGini: GINI is a measure of node impurity. Think of it like this: if you use this feature to split the data, how pure will the nodes be?
# Highest purity means that each node contains only elements of a single class.
# Assessing the decrease in GINI when that feature is omitted leads to an understanding of how important that feature is to split the data correctly.
# let's extract the matrix for GINI and Accuracy
importance_RF <- as.data.frame(RF$importance[,3:4])
str(importance_RF)
importance_RF$Feature<- row.names(importance_RF)
str(importance_RF)
# same words we get with
varImpPlot(RF )
print(head(importance_RF[order(importance_RF$MeanDecreaseGini, decreasing=TRUE),]))
# Note that these measures are used to rank variables in terms of importance and, thus, their absolute values could be disregarded.
# As a result, we just get for example the GINI statistics overall w/o differentiating the words most important for specific classes.
# Which are however the most important words for the relevant label? And for the irrelevant one?
# let's predict the training-set texts and let's store a new variable in our dfm of the training-set with such predictions.
# Why that? Cause we will use such variable to identify the "sign" of the most important features
predicted_rf <- predict(RF, train, type="class")
table(predicted_rf )
Dfm_train@docvars$predRF <- predicted_rf
str(Dfm_train@docvars)
# let's now assign a feature to the relevant/irrelevant class according to its frequency
# (if, for example, the word "collapse" appears 10 times among predicted relevant tweets and 5 times among predicted irrelevant tweets,
# we are going to classify it as pertaining to the "relevant tweets" world; and so on)
# First, let's estimate the number of times a word appear in those tweets that we have classified as related to accidents or otherwise
sums <- list()
for (v in 0:1){
sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predRF"]==v,])
}
str(sums)
sums <- do.call(cbind, sums) # do.call allows to use cbind on a list (i.e., collection of objects of different types as in the case of sums)
sums
# let's apply the sign (either 1 or 2) if, respectively, a word appears more in the tweets classified as not related to accidents or related to them
sign <- apply(sums, 1, which.max)
# get the feature
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
Feature = names,
sign = sign-1, # let's now recode the sign from 1 and 2 to 0 and 1 (0=no accidentes; 1=accidents)
stringsAsFactors=F)
str(df)
importanceRF <- merge(importance_RF, df, by="Feature")
str(importanceRF)
## best predictors
for (v in 0:1){
cat("\n\n")
cat("value==", v)
importanceRF <- importanceRF [order(importanceRF $MeanDecreaseGini, decreasing=TRUE),]
print(head(importanceRF [importanceRF $sign==v,], n=10))
cat("\n")
cat(paste(unique(head(importanceRF $Features[importanceRF $sign==v], n=10)), collapse=", "))
}
# let's draw a graph with our results focusing on MeanDecreaseGini
importanceRF$Gini <- ifelse(importanceRF $sign==1,importanceRF $MeanDecreaseGini,importanceRF $MeanDecreaseGini*-1)
# the top 20 words for the disaster tweets
importance2RF <- top_n(importanceRF, 20, Gini )
importance2RF
# the top 20 words for teets not related to disasters
importance3RF <- top_n(importanceRF, -20, Gini )
importance3RF
importance_newRF <- rbind(importance2RF, importance3RF)
str(importance_newRF)
# reorder the features
importance_newRF <- mutate(importance_newRF, Feature= reorder(Feature, Gini ))
# let's finally create a new string variabile according to the value of sign
importance_newRF$sign2<- ifelse(importance_newRF$sign==1,"relevant","irrelevant")
str(importance_newRF)
ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Mean Decrease Gini coefficient") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
fill = "Disaster")
# let's store the graph
RF_graph <- ggplot(importance_newRF, aes(Feature, Gini, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Mean Decrease Gini coefficient") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Disaster tweets",
subtitle = "Irrelevant (-) versus Relevant (+) words - Random Forest Model",
fill = "Disaster")
# let's FINALLY predict the test-set. We define the seed to get always the same prediction. Otherwise, you can get each time slighty
# different results according to how to randomly classify a given document when you have a draw across decision trees
# (i.e., say 250 trees say "positive" and 250 trees say "negative")
set.seed(123)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf )
set.seed(14)
system.time(predicted_rf <- predict(RF, test))
table(predicted_rf )
# note that if you add: "predict.all=TRUE", then the returned object is a list
# of 2 components: aggregate, which is the vector of predicted values by the forest (as above),
# and individual, which is a matrix where each column contains prediction by a tree in the forest.
set.seed(123)
system.time(predicted_rf2 <- predict(RF, test, predict.all=TRUE))
str(predicted_rf2)
table(predicted_rf2$aggregate )
nrow(test)
head(predicted_rf2$individual)
######################################################
######################################################
# Let's compare the results out-of-sample we got via Naive Bayes & RF
######################################################
######################################################
prop.table(table(predicted_nb ))
prop.table(table(predicted_rf ))
results <- as.data.frame(rbind(prop.table(table(predicted_nb )), prop.table(table(predicted_rf ))))
str(results)
results$algorithm <- c("NB", "RF")
str(results)
# Let's plot the results!
library(reshape2)
df.long<-melt(results,id.vars=c("algorithm"))
str(df.long)
ggplot(df.long,aes(algorithm,value,fill=variable))+
geom_bar(position="dodge",stat="identity") + theme(axis.text.x = element_text(color="#993333", size=10, angle=90)) + coord_flip() +
ylab(label="Review class in the test-set") + xlab("algorithm") + scale_fill_discrete(name = "Prediction", labels = c("Irrelevant", "Relevant"))
# Similar but not exactly the same results for the prediction!
# And so, which result is to trust more than the other one?
# For answering to that, we need to introduce the Cross-Validation procedure...
#################################################
# let's compare feature importance across models
#################################################
library(gridExtra)
grid.arrange(NB_graph, RF_graph, nrow=2) # Plot everything together
# let's check some correlation
# let's merge the dataframe
words <- merge(NB_prob, importanceRF, by="Feature")
str(words)
library(PerformanceAnalytics)
chart.Correlation(words[c(5, 7:8)])