rm(list=ls(all=TRUE))
setwd("C:/Users/luigi/Dropbox/TOPIC MODEL")
getwd()
library(quanteda)
library(readtext)
library(cvTools) # cvFolds() for the k-fold cross-validation below
library(caret)
library(car)
library(xgboost)
library(ggplot2)
library(dplyr)
library(reshape)
library(Ckmeans.1d.dp) # used by xgb.ggplot.importance() below
#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
# let's focus on MOVIE reviews (either positive or negative)
x <- read.csv("train_review2.csv", stringsAsFactors=FALSE)
str(x)
nrow(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTwitterTrain <- corpus(x)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in at least 5% of the reviews
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 0.05, verbose=TRUE, docfreq_type = "prop")
topfeatures(Dfm_train , 20) # 20 top words
#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
x10 <- read.csv("test_review3.csv", stringsAsFactors=FALSE)
str(x10)
nrow(x10)
myCorpusTwitterTest <- corpus(x10)
tok2 <- tokens(myCorpusTwitterTest, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
Dfm_test <- dfm(tok2)
Dfm_test<- dfm_trim(Dfm_test, min_docfreq = 0.05, verbose=TRUE, docfreq_type = "prop")
topfeatures(Dfm_test , 20) # 20 top words
#####################################################
# THIRD STEP: Let's make the features identical between train and test-set by passing Dfm_train to dfm_match() as a pattern.
# after this step, we can "predict" by employing only the features included in the training-set!
# this moreover makes the DfM for the test-set smaller if you have a giant test-set compared
# to the training-set (not the case here, of course!)
#####################################################
setequal(featnames(Dfm_train), featnames(Dfm_test))
length(featnames(Dfm_test))
length(featnames(Dfm_train))
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(featnames(test_dfm))
setequal(featnames(Dfm_train), featnames(test_dfm ))
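# one caveat (our own addition): a test review made up exclusively of features never seen in the
# training-set ends up as an empty row after matching; worth a quick check with quanteda's ntoken()
table(ntoken(test_dfm) == 0)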
#####################################################
# FOURTH STEP: Let's convert the DfM into matrices for the ML algorithms to work
#####################################################
train <- as.matrix(Dfm_train)
test <- as.matrix(test_dfm)
#####################################################
# FIFTH STEP: let's estimate a ML Model
#####################################################
#####################################################
# let's run a Gradient Boosting
#####################################################
# You always need to add the number of classes to the call if the outcome is a multi-category variable.
# In the present case, we have just two classes (negative & positive) so we do not need to worry about it. Other situations will be
# different (as we will see)
numberOfClasses <- length(unique(Dfm_train@docvars$Sentiment))
numberOfClasses
# your DV should always be numeric and start from 0. If that is not the case (as here), you need to create such a variable
num <- as.factor(Dfm_train@docvars$Sentiment)
num
table(num)
num <- as.numeric(num)
num
num[ num ==1 ] <-0
num[ num ==2 ] <-1
table(num)
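# (equivalently, in one line: num <- as.numeric(as.factor(Dfm_train@docvars$Sentiment)) - 1,
# since as.numeric() on a factor returns its 1-based level codes)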
# let's add this variable back to our DfM and call it "code"
Dfm_train@docvars$code <- num
str(Dfm_train@docvars)
# the two tables below should show the same distribution. Good!
table(Dfm_train@docvars$code)
table(Dfm_train@docvars$Sentiment)
# If you have just 2 classes (positive/negative) as in our example, you must set objective = "binary:logistic".
# Note: logistic regression for binary classification returns the predicted probability of belonging to each given class (not the predicted class)!
# And if you have more than 2 classes in your DV? See below!
# There are several hyperparameters in a GB, as we have discussed (we will come back to this when we discuss the grid-search).
# Here, I select a specific value for the number of trees ("nrounds") as well as for "eta".
# NOTE: "nthread" (number of threads) parallelizes the computation on your machine. And this makes everything faster!
set.seed(123)
system.time(xgb.fit1 <- xgboost(
data = train,
label = Dfm_train@docvars$code,
nrounds = 500,
eta = 1, nthread = 4,
objective = "binary:logistic", # for binary
eval_metric = "error", # binary classification error rate
verbose = 1 # not silent; if you want it silent "verbose=0"
))
# As the evaluation metric here we are using the binary classification error rate. It is calculated as (# wrong cases) / (# all cases).
# By default, it uses the 0.5 threshold on predicted values to define negative and positive instances. A different threshold t can be specified as "error@t" (e.g., "error@0.6").
# You can also decide to focus on the log-loss function discussed in class by specifying eval_metric = "logloss" (the lower the loss score, the better the results)
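# A minimal sketch of the same model evaluated with log-loss instead (our own illustration; the
# shorter run with nrounds = 100 is just to keep it quick, not a tuned choice):
set.seed(123)
xgb.fit_ll <- xgboost(
data = train,
label = Dfm_train@docvars$code,
nrounds = 100,
eta = 1, nthread = 4,
objective = "binary:logistic",
eval_metric = "logloss", # lower is better
verbose = 0)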
# Let's do some Global Interpretation for our Gradient Boosting model!
# compute feature importance matrix
importance_matrix <- xgb.importance(model = xgb.fit1)
head(importance_matrix)
# Gain: the relative contribution of the corresponding feature to the model, calculated by taking each feature's contribution
# for each tree in the model. More in detail: at each split in each tree, xgboost computes the improvement in the split-criterion (i.e., the reduction in
# the classification error). It then averages the improvement made by each variable across all the trees in which the variable is used.
# The variables with the largest average decrease in the classification error are considered the most important.
# Cover: the relative number of observations related to this feature. For example, suppose you have 100 observations, 4 features and 3 trees,
# and suppose feature1 is used to decide the leaf node for 10, 5, and 2 observations in tree1, tree2 and tree3 respectively;
# then the metric counts the cover for this feature as 10+5+2 = 17 observations. The same is computed for all 4 features, and the cover is
# reported as 17 expressed as a percentage of all features' cover metrics.
# Frequency: the percentage representing the relative number of times a particular feature occurs in the trees of the model.
# In the above example, if feature1 occurred in 2 splits, 1 split and 3 splits in tree1, tree2 and tree3 respectively, then its weight
# is 2+1+3 = 6. The frequency for feature1 is then its weight expressed as a percentage of the weights of all features.
# let's focus on GAIN (quite common...)
importance <- importance_matrix[order(importance_matrix$Gain, decreasing=TRUE),]
head(importance, n=20)
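# A quick visual check (our own addition): xgb.ggplot.importance() plots the top features by Gain;
# it is also the reason we loaded Ckmeans.1d.dp at the top (the function uses it to cluster the bars)
xgb.ggplot.importance(importance_matrix, top_n = 20)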
# the problem here is that we just get the Gain statistics without differentiating which words matter most for each class
# for example: which are the most important words for the positive label? And for the negative one?
# so let's improve on that, following the same path we followed with RF and SVM!
predicted_xgb <- round(predict(xgb.fit1, train))
table(predicted_xgb)
Dfm_train@docvars$predXGB <- predicted_xgb
# adding the sign [0/1 according to the predicted content of the review - neg or pos]
sums <- list()
for (v in 0:1){
sums[[v+1]] <- colSums(train[Dfm_train@docvars[,"predXGB"]==v,])
}
# let's assign a word to pos/neg according to its frequency (if for example the word "bad" appears
# 10 times among predicted negative reviews and 5 times among predicted positive reviews,
# we classify it as pertaining to the "negative reviews" world; and so on)
sums <- do.call(cbind, sums)
sign <- apply(sums, 1, which.max)
str(sign)
# get the feature
names <- dimnames(train)[[2]]
str(names)
df <- data.frame(
Feature = names,
sign = sign-1,
stringsAsFactors=F)
str(df)
importance <- merge(importance_matrix, df, by="Feature")
str(importance)
## best predictors
for (v in 0:1){
cat("\n\n")
cat("value==", v)
importance <- importance[order(importance$Gain, decreasing=TRUE),]
print(head(importance[importance$sign==v,], n=10))
cat("\n")
cat(paste(unique(head(importance$Feature[importance$sign==v], n=10)), collapse=", "))
}
# Let's draw a graph!
str(importance)
importance$Gain_OK <- ifelse(importance$sign>0,importance$Gain,importance$Gain*-1)
importance2 <- top_n(importance, 20, Gain_OK)
importance2
importance3 <- top_n(importance, -20, Gain_OK)
importance3
importance_new <- rbind(importance2, importance3)
str(importance_new)
# reorder the features
importance_new <- mutate(importance_new, Feature = reorder(Feature, Gain_OK))
importance_new$sign2 <- ifelse(importance_new$sign>0, "positive", "negative")
ggplot(importance_new, aes(Feature, Gain_OK, fill = sign2)) +
geom_bar(stat="identity", fill= "white") + theme_classic()+
geom_col(show.legend = T) + theme(legend.position="bottom")+
coord_flip() +
ylab("Gain coefficient (we recoded as negative the values for the Negative features)") +
scale_fill_manual(values = c("#000033", "#006699")) +
labs(title = "Movie Reviews",
subtitle = "Negative (-) versus Positive (+) words - XGB Model",
fill = "Movie Reviews")
# As an alternative way to understand the relationship between a given feature and a class-label, we could
# also estimate SHAPLEY values: SHAPLEY values indicate what effect each feature had on each
# observation and, crucially, the direction of that effect.
# The idea behind SHAPLEY values is to assess every combination of predictors to determine each predictor's impact.
# Focusing on feature x1, the approach tests the accuracy of every combination of features not including x1,
# and then tests how adding x1 to each combination improves the accuracy.
# Unfortunately, computing SHAPLEY values is very computationally expensive (in particular when dealing with texts
# and thousands of features). Moreover, you would need to compute a SHAPLEY value (and its direction) for each feature
# included in each single observation, and then take the average across observations. Another very long process...
# Unless you can take advantage of an already existing function:
# let's focus on the two most important features for our model
head(importance, n=2)
# let's plot the SHAPLEY value
xgb.plot.shap(train, model=xgb.fit1, top_n = 2, col = rgb(0, 0, 1, 0.8))
xgb.ggplot.shap.summary(train, model=xgb.fit1, top_n = 2)
# let's FINALLY predict the test data
predicted_xgb <- predict(xgb.fit1, test)
table(predicted_xgb)
# what's that??? Remember: logistic regression for binary classification in XGB returns predicted probability (not class).
# Therefore let's use "round"
predicted_xgb <- round(predict(xgb.fit1, test))
table(predicted_xgb)
prop.table(table(predicted_xgb))
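# If test_review3.csv also carries a human-coded Sentiment column (an assumption on our part:
# check str(x10) above), you could compare the predictions with the actual labels:
if ("Sentiment" %in% names(x10)) {
print(table("Predictions" = predicted_xgb, "Actual" = x10$Sentiment))
}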
######################################################
######################################################
# Let's run a k-fold CROSS-VALIDATION (k = 5) with a loop!
# HERE you HAVE JUST 2 classes for the outcome (negative/positive)
######################################################
######################################################
# Some R packages, such as caret, allow you to run a k-fold cross-validation with a few lines of code.
# But I want you to understand what's going on, so it is better to work with this loop! Then, if you wish, move to caret (see the sketch right below)
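# For reference, a minimal caret sketch of the same 5-fold CV (our own illustration; "xgbTree" is
# caret's wrapper around xgboost, and tuneGrid pins every hyperparameter to a single value so that
# no tuning takes place; max_depth = 6, gamma = 0, etc. are xgboost's defaults):
grid <- expand.grid(nrounds = 500, eta = 1, max_depth = 6, gamma = 0,
colsample_bytree = 1, min_child_weight = 1, subsample = 1)
set.seed(123)
cv_caret <- caret::train(x = train, y = factor(Dfm_train@docvars$code), # caret::train qualified to avoid the clash with our "train" matrix
method = "xgbTree", trControl = trainControl(method = "cv", number = 5), tuneGrid = grid)
cv_caret$results # accuracy and kappa averaged across the 5 folds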
# STEP 1: create the 5 folds
ttrain <- train # let's rename the original train matrix, given that we will reuse the name "train" inside the loop below
# let's split our training-set in 5 folds
set.seed(123) # set the seed for replicability
k <- 5 # the number of folds; whatever number you choose here, the procedure below will always work!
folds <- cvFolds(NROW(ttrain), K=k)
str(folds)
# STEP 2: run the loop over the k folds
system.time(for(i in 1:k){
train <- ttrain[folds$subsets[folds$which != i], ]
validation <- ttrain[folds$subsets[folds$which == i], ]
set.seed(123)
newrf <- xgboost(
data = train,
label = Dfm_train[folds$subsets[folds$which != i], ]@docvars$code, # here "code" NOT "Sentiment"!!!
nrounds = 500,
eta = 1, nthread = 4,
objective = "binary:logistic", # for binary
eval_metric = "error", # binary classification error rate
verbose = 0)
newpred <- predict(newrf, newdata=validation) # binary:logistic returns probabilities...
newpred <- round(newpred) # ...so we round them to recover the predicted classes
class_table <- table("Predictions"= newpred, "Actual"=Dfm_train[folds$subsets[folds$which == i], ]@docvars$code)
print(class_table)
df<-confusionMatrix( class_table, mode = "everything")
df.name<-paste0("conf.mat.xgb",i)
assign(df.name,df)
})
# STEP 3: the metrics
XGBPredict <- data.frame() # empty container; rbind() below fills it fold by fold
for(i in mget(ls(pattern = "conf.mat.xgb")) ) {
Accuracy <- (i)$overall[1] # save the accuracy value of this fold
F1_negative <- (2*(i)$byClass[1]*(i)$byClass[3])/((i)$byClass[1]+(i)$byClass[3]) # F1 for the negative class (from Sensitivity & Pos Pred Value)
F1_positive <- (2*(i)$byClass[2]*(i)$byClass[4])/((i)$byClass[2]+(i)$byClass[4]) # F1 for the positive class (from Specificity & Neg Pred Value)
XGBPredict <- rbind(XGBPredict , cbind(Accuracy , F1_negative, F1_positive))
}
XGBPredict[is.na(XGBPredict)] <- 0
str(XGBPredict)
# Let's compare the average value for accuracy and F1
acc_xgb_avg <- mean(XGBPredict[, 1])
f1_xgb_avg <- mean(colMeans(XGBPredict[, -1])) # average of the two class-specific F1 columns
acc_xgb_avg
f1_xgb_avg
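# Sanity check (our own addition): with mode = "everything", caret itself reports an F1 in byClass,
# computed for the first factor level (here the "0"/negative class), so it should match our
# F1_negative fold by fold, e.g. for the first fold:
conf.mat.xgb1$byClass["F1"]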
#####################################################
# let's prepare the training-set with 3 categories (this script works fine for any number of categories>2)
#####################################################
######################################################
######################################################
# what are the main changes when running an XGB?
######################################################
######################################################
uk_train <- read.csv("uk_train.csv")
str(uk_train)
myCorpusTwitterTrain <- corpus(uk_train)
tok2 <- tokens(myCorpusTwitterTrain, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
tok2 <- tokens_wordstem (tok2)
Dfm_train <- dfm(tok2)
# Let's trim the dfm in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train , min_docfreq = 2, verbose=TRUE)
topfeatures(Dfm_train , 20) # 20 top words
train <- as.matrix(Dfm_train)
# our classes
table(Dfm_train@docvars$Sentiment)
# our benchmark: accuracy .530
prop.table(table(Dfm_train@docvars$Sentiment))
# REMEMBER: you always need to pass the number of classes to the call when the outcome is a multi-category variable, as it is here!
numberOfClasses <- length(unique(Dfm_train@docvars$Sentiment))
numberOfClasses
# your DV should always be numeric and start from 0. If that is not the case (as here), you need to create such a variable
x <- as.factor(Dfm_train@docvars$Sentiment)
x
table(x)
x <- as.numeric(x)
table(x)
x[ x ==1 ] <-0
x[ x ==2 ] <-1
x[ x ==3 ] <-2 # CHANGE!
table(x)
Dfm_train@docvars$code <- x
str(Dfm_train)
table(Dfm_train@docvars$code)
table(Dfm_train@docvars$Sentiment)
# If you have more than 2 classes in your DV, you can set objective = "multi:softmax".
# This is a multiclass classification using the softmax objective, and it returns the predicted class (not probabilities), contrary to objective = "binary:logistic".
# Alternatively you can use "multi:softprob": the same as softmax, but it returns the predicted probability of each data point belonging to each class. Rather than rounding
# as in the binary example above, you then reshape those probabilities into a matrix and pick, for each document, the most likely class (see the sketch after the model below).
# As discussed above, in this case you also need to set the additional num_class parameter defining the number of unique classes.
# Finally, in the multinomial case you can specify either eval_metric = "merror" (i.e., the multiclass classification error rate calculated as
# usual as (# wrong cases) / (# all cases)) or eval_metric = "mlogloss" (i.e., the multiclass log-loss)
set.seed(123)
system.time(xgb.fit1 <- xgboost(
data = train,
label = Dfm_train@docvars$code,
nrounds = 500,
eta = 1, nthread = 4,
objective = "multi:softmax", # for multi-category # CHANGE!
"num_class" = numberOfClasses, # add this line! # CHANGE!
eval_metric = "merror", # CHANGE
verbose = 1 # not silent; if you want it silent "verbose=0"
))
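# A minimal sketch of the "multi:softprob" alternative mentioned above (our own illustration; the
# shorter run with nrounds = 100 is just to keep it quick, not a tuned choice). predict() returns
# one probability per class per document, stacked in a single vector, so we reshape it and take the
# most likely class per document:
set.seed(123)
xgb.fit_sp <- xgboost(
data = train,
label = Dfm_train@docvars$code,
nrounds = 100,
eta = 1, nthread = 4,
objective = "multi:softprob",
"num_class" = numberOfClasses,
eval_metric = "mlogloss",
verbose = 0)
probs <- predict(xgb.fit_sp, train) # length = nrow(train) * numberOfClasses
prob_mat <- matrix(probs, ncol = numberOfClasses, byrow = TRUE) # one row per document
pred_class <- max.col(prob_mat) - 1 # back to the 0-based codes
table(pred_class)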