####################################################
# classifiers with a general example
####################################################
rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(quanteda)
library(readtext)

data(iris)
str(iris)
# This famous iris data set gives the measurements (in centimeters) of sepal length and width and
# petal length and width for 50 flowers from each of 3 species of iris.
# The species are Iris setosa, versicolor, and virginica.

# let's run a v-fold cross-validation (v=2)
set.seed(101) # set the seed so that the same sample can be reproduced in the future
# now select 50% of the data as a sample from the total 'n' rows of the data
sample <- sample.int(n = nrow(iris), size = floor(.50*nrow(iris)), replace = F)
train <- iris[sample, ]
test <- iris[-sample, ]
# I consider train as V1 and test as V2
nrow(train)
nrow(test)

####################################################
# An example with the Naïve Bayes classifier
####################################################
# first let's use V1 to train the algorithm and then predict V2
set.seed(1234)
system.time(NB <- naiveBayes(Species ~ ., data=train))
predictNB <- predict(NB, newdata=test)
table(predictNB)

# Let's estimate the "confusion matrix"
table("Predictions"= predictNB, "Actual"=test$Species)
# Accuracy (number of documents on the diagonal / total number of documents)
(26+15+28) / nrow(test)
x <- as.matrix(table("Predictions"= predictNB, "Actual"=test$Species))
sum(diag(x)) / nrow(test)
acc <- sum(diag(x)) / sum(x)
acc

# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat <- confusionMatrix(predictNB, test$Species)
conf.mat
# Precision for versicolor (=Pos Pred Value; True Positives/Predicted Positives, i.e. TP/(TP+FP))
15/18
# Recall for virginica (=Sensitivity; True Positives/Actual Positives, i.e. TP/(TP+FN))
28/31

# Now let's reverse the cross-validation procedure and use test (V2) to train the algorithm and then predict train (V1)!
set.seed(1234)
NB2 <- naiveBayes(Species ~ ., data=test)
system.time(predictNB2 <- predict(NB2, newdata=train))
# Let's estimate the "confusion matrix"
table("Predictions"= predictNB2, "Actual"=train$Species)
x_alt <- as.matrix(table("Predictions"= predictNB2, "Actual"=train$Species))
sum(diag(x_alt)) / sum(x_alt)
acc_alt <- sum(diag(x_alt)) / sum(x_alt)
# Let's use the confusionMatrix command
conf.mat_alt <- confusionMatrix(predictNB2, train$Species)
conf.mat_alt

# K-fold cross-validation (with k=2)
# Accuracy (average across the two cross-validation runs, i.e., V1 vs. V2 and V2 vs. V1)
(acc_alt+acc)/2
# Precision (=Pos Pred Value) for versicolor
conf.mat
conf.mat_alt
(0.8333+1)/2
# Recall (=Sensitivity) for virginica
(0.9032+1)/2

####### REMEMBER: when you have a (relatively) small training set, doing cross-validation is even more important,
####### because it allows you to check whether your results are driven by the coding (errors? bad coding? outliers?)
####### of a specific subset of documents.
####### This shows up when the accuracy of V1 vs. V2 is dramatically different from the one you get from V2 vs. V1!
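
# A minimal sketch: instead of averaging precision and recall by hand as above, we can pull
# "Pos Pred Value" (precision) and "Sensitivity" (recall) for every class from the two caret
# objects created above (conf.mat and conf.mat_alt) and average them across the two folds.
precision_avg <- (conf.mat$byClass[, "Pos Pred Value"] + conf.mat_alt$byClass[, "Pos Pred Value"]) / 2
recall_avg    <- (conf.mat$byClass[, "Sensitivity"]    + conf.mat_alt$byClass[, "Sensitivity"]) / 2
round(cbind(precision_avg, recall_avg), 4)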
####################################################
# An example with Random Forest (we just use V1 to predict V2 here to save space...)
####################################################
set.seed(123)
# by default we build 500 trees
RF <- randomForest(Species ~ ., data=train, type="classification")
system.time(predictRF <- predict(RF, newdata=test))
table(predictRF)
table(test$Species, predictRF)
# Accuracy (proportion of correctly classified documents)
x2 <- as.matrix(table("Predictions"= predictRF, "Actual"=test$Species))
sum(diag(x2)) / sum(x2)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat2 <- confusionMatrix(predictRF, test$Species)
conf.mat2

####################################################
# An example with SVM (we just use V1 to predict V2 here to save space...)
####################################################
set.seed(123)
SVM <- svm(Species ~ ., data=train, type="C-classification")
summary(SVM)
system.time(predictSVM <- predict(SVM, newdata=test))
table(predictSVM)
table(test$Species, predictSVM)
# Accuracy (proportion of correctly classified documents)
x3 <- as.matrix(table("Predictions"= predictSVM, "Actual"=test$Species))
sum(diag(x3)) / sum(x3)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat3 <- confusionMatrix(predictSVM, test$Species)
conf.mat3

####### REMEMBER: you can also use cross-validation to compare the performance of different algorithms!
####### For example: among the three classifiers above, which one performed best according to
####### accuracy, precision and recall?

####################################################
# classifiers with texts
####################################################
# This is a sample of 1000 tweets written in English about Trump, published from 1.17.2018 to 1.19.2018
x10 <- read.csv("Trump_tweets2.csv", stringsAsFactors=FALSE)
str(x10)
myCorpusTwitter <- corpus(x10)

# let's suppose that the sentiment classification via data_dictionary_LSD2015 represents the "true"
# sentiment that a set of human coders would have found
trump3 <- dfm(myCorpusTwitter, remove = c(stopwords("english"), ("rt"), ("t.co")),
              remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_twitter = TRUE,
              remove_url = TRUE, dictionary = data_dictionary_LSD2015[1:2])
str(trump3)
df <- data.frame(trump3)
str(df)
head(df)
df$sentiment_diff <- df$positive - df$negative
str(df)
# we assume 0=neutral. Alternatively, we could have forced everything between -1 and +1 to be neutral
df$sentiment2[df$sentiment_diff==0] <- "Neutral"
df$sentiment2[df$sentiment_diff<0] <- "Negative"
df$sentiment2[df$sentiment_diff>0] <- "Positive"
str(df)
table(df$sentiment2)
df$sentiment2 <- as.factor(df$sentiment2)
str(df)
# add the sentiment to the original set of tweets
x10$sentiment <- df$sentiment2
str(x10)
table(x10$sentiment)

# removing all punctuation here as well (it's a mess with tweets!)
library(stringr)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
kwic(x10$text, "Trump's")

# let's re-build the corpus (with the added sentiment variable) and the dfm
myCorpusTwitter <- corpus(x10)
myDfm3 <- dfm(myCorpusTwitter, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                          ("u"), ("t"), ("s"), ("ed"), ("https")),
              remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
              remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
topfeatures(myDfm3, 20) # 20 top words
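
# A quick sketch: before trimming, check how many features would survive the 5% document-frequency
# threshold used below. docfreq(), ndoc() and nfeat() are standard quanteda helpers.
df_counts <- docfreq(myDfm3)
sum(df_counts >= 0.05 * ndoc(myDfm3))   # features appearing in at least 5% of the tweets
nfeat(myDfm3)                           # total number of features before trimming, for comparison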
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
trim <- dfm_trim(myDfm3, min_docfreq= 0.05)
# transform the data into a data frame
data <- as.data.frame(as.matrix(trim))
str(data)
# this is important: randomForest can't handle column names that begin with a space, a comma, a number
# or certain punctuation marks. The command below adds a letter in front of a number (if you have any
# numbers left in the dfm). Highly suggested
colnames(data) <- make.names(colnames(data))
summary(x10$sentiment)
data$sentiment <- x10$sentiment
colnames(data)
summary(data$sentiment)

# Build a training and a test set. Given that we are assuming that the sentiment classification via
# data_dictionary_LSD2015 represents a kind of "human coding", we are in effect running a kind of
# v-fold cross-validation procedure (with v=2)!
set.seed(123) # set the seed so that the same sample can be reproduced in the future
# now select 60% of the data as a sample from the total 'n' rows of the data
sample <- sample.int(n = nrow(data), size = floor(.60*nrow(data)), replace = F)
train <- data[sample, ]
test <- data[-sample, ]
nrow(train)
nrow(test)

###########################################
# Try a random forest model
###########################################
set.seed(123)
system.time(RF <- randomForest(sentiment ~ ., data=train, type="classification"))
predictRF <- predict(RF, newdata=test)
table(predictRF)
table(test$sentiment, predictRF)
# Accuracy (proportion of correctly classified documents)
x2 <- as.matrix(table("Predictions"= predictRF, "Actual"=test$sentiment))
sum(diag(x2)) / nrow(test)
sum(diag(x2)) / sum(x2)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat <- confusionMatrix(predictRF, test$sentiment)
conf.mat
# Accuracy (proportion of correctly classified documents): 0.698
# But look at Precision (=Pos Pred Value) for Negative and, even more, at
# Recall (=Sensitivity) for Neutral and Positive...

###########################################
# Try an SVM model
###########################################
# I would suggest going for a linear kernel when you have a large number of features (>1000, as in our
# case with >2800 words), because the data are then more likely to be linearly separable in a
# high-dimensional space
set.seed(123)
system.time(SV <- svm(sentiment ~ ., data=train, type = "C-classification", kernel='linear'))

# If you ever get this warning message (as is the case now!):
# "In svm.default(x, y, scale = scale, ..., na.action = na.action) :
#  Variable(s) 'XXX' and 'YYY' and 'WWW' constant. Cannot scale data"
# it means that the dfm of the training set contains some words that are always 0 across all texts.
# This is a problem because, by default, data in an SVM are scaled to zero mean and unit variance.
# The center and scale values are returned and used for later predictions. But if you have just a string of 0s...
# So what to do? Either you drop such words and re-run the analysis, or you add "scale=FALSE" to the svm command.
# The first option is always better!

# delete all the words with sum=0
# (a more general version of this filtering, without the hard-coded column range, is sketched right
# after the SVM predictions below)
colnames(train)
colSums(train[,1:3596])
# the sentiment column is the last one, so we explicitly keep it with TRUE
train <- train[, c(colSums(train[,1:3596] != 0) > 0, TRUE)]
set.seed(123)
system.time(SV <- svm(sentiment ~ ., data=train, type = "C-classification", kernel='linear'))
predictSV <- predict(SV, newdata=test)
table(predictSV)
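
# A reusable sketch of the same zero-sum filtering, without the hard-coded column range.
# drop_zero_features() is just an illustrative helper; it assumes that 'sentiment' is the only
# non-feature column in the data frame.
drop_zero_features <- function(d, label = "sentiment") {
  feats <- setdiff(colnames(d), label)      # word-count columns only
  keep  <- feats[colSums(d[, feats]) > 0]   # features that occur at least once
  d[, c(keep, label)]
}
# e.g. train <- drop_zero_features(train) gives the same result as the manual filtering above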
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat2 <- confusionMatrix(predictSV, test$sentiment)
conf.mat2

###########################################
## Try a Naive Bayes model: back to quanteda
###########################################
# Since Naive Bayes evaluates products of probabilities, we need some way of assigning
# non-zero probabilities to words which do not occur in the sample.
# That is, Naive Bayes can only take into consideration features that occur both in the training set
# and in the test set, but we can make the features identical by passing training_dfm to dfm_select() as a pattern.
summary(myCorpusTwitter)

# generate 600 numbers without replacement to treat them as the training set
set.seed(123)
id_train <- sample(1:1000, 600, replace = FALSE)
head(id_train, 10)

# create a docvar with the document ID
docvars(myCorpusTwitter, "id_numeric") <- 1:ndoc(myCorpusTwitter)
summary(myCorpusTwitter)

# get the training set (documents in id_train) and compute the dfm out of it
training_corpus <- corpus_subset(myCorpusTwitter, id_numeric %in% id_train)
summary(training_corpus)
training_dfm <- dfm(training_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                                ("u"), ("t"), ("s"), ("ed"), ("https")),
                    remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                    remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
training_dfm <- dfm_trim(training_dfm, min_docfreq= 0.05)

# get the test set (documents not in id_train) and compute the dfm out of it
test_corpus <- corpus_subset(myCorpusTwitter, !id_numeric %in% id_train)
summary(test_corpus)
test_dfm <- dfm(test_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                        ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
test_dfm <- dfm_trim(test_dfm, min_docfreq= 0.05)

# train the Naive Bayes classifier using textmodel_nb() and a multinomial distribution (the default)
nb <- textmodel_nb(training_dfm, docvars(training_dfm, "sentiment"), distribution = c("multinomial"))
summary(nb)

# Let's make the features identical by passing training_dfm to dfm_select() as a pattern
test_dfm <- dfm_select(test_dfm, training_dfm)

# Let's inspect how well the classification worked
actual_class <- docvars(test_dfm, "sentiment")
predicted_class <- predict(nb, test_dfm)
table(predicted_class)
prop.table(table(predicted_class))
class_table <- table(actual_class, predicted_class)
class_table
confusionMatrix(class_table, mode = "everything")
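
# A minimal sketch: per-class precision, recall and F1 computed directly from class_table as built
# above (rows = actual, columns = predicted), just to make the arithmetic explicit.
# (note: caret's confusionMatrix() expects a table with predictions in rows, so the orientation of
# its per-class labels may differ from this one)
tp <- diag(class_table)
precision <- tp / colSums(class_table)   # TP / predicted positives
recall    <- tp / rowSums(class_table)   # TP / actual positives
f1 <- 2 * precision * recall / (precision + recall)
round(cbind(precision, recall, f1), 3)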
####################################
#### replicate the entire analysis, but now reduce the training set to 30%.
#### Which are the main changes in terms of accuracy? It decreases for the RF, SVM and Naive Bayes
#### classifiers (less data on which to train the algorithm!)
####################################
trim <- dfm_trim(myDfm3, min_docfreq= 0.05)
data <- as.data.frame(as.matrix(trim))
str(data)
colnames(data)
colnames(data) <- make.names(colnames(data))
colnames(data)
summary(x10$sentiment)
data$sentiment <- x10$sentiment
str(data$sentiment)
summary(data$sentiment)

set.seed(123)
sample <- sample.int(n = nrow(data), size = floor(.30*nrow(data)), replace = F)
train <- data[sample, ]
test <- data[-sample, ]
nrow(train)
nrow(test)

#################################
# Try a random forest model [not in the class!]
#################################
set.seed(123)
tweetRF <- randomForest(sentiment ~ ., data=train, type="classification")
str(tweetRF)
predictRF <- predict(tweetRF, newdata=test)
table(predictRF)
table(test$sentiment)
table(test$sentiment, predictRF)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat <- confusionMatrix(predictRF, test$sentiment)
conf.mat

#################################
# Try an SVM model
#################################
# delete all the words with sum=0
colnames(train)
colSums(train[,1:1694])
# the sentiment column is the last one, so we explicitly keep it with TRUE
train <- train[, c(colSums(train[,1:1694] != 0) > 0, TRUE)]
set.seed(123)
system.time(SV <- svm(sentiment ~ ., data=train, type = "C-classification", kernel='linear'))
predictSV <- predict(SV, newdata=test)
table(predictSV)
# Let's use the confusionMatrix command (same results as above for accuracy!)
conf.mat2 <- confusionMatrix(predictSV, test$sentiment)
conf.mat2

#################################
## Try a Naive Bayes model
#################################
summary(myCorpusTwitter)
# generate 300 numbers without replacement to treat them as the training set
set.seed(123)
id_train <- sample(1:1000, 300, replace = FALSE)
head(id_train, 10)
# create a docvar with the document ID
docvars(myCorpusTwitter, "id_numeric") <- 1:ndoc(myCorpusTwitter)
summary(myCorpusTwitter)

# get the training set (documents in id_train) and compute the dfm out of it
training_corpus <- corpus_subset(myCorpusTwitter, id_numeric %in% id_train)
summary(training_corpus)
training_dfm <- dfm(training_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                                ("u"), ("t"), ("s"), ("ed"), ("https")),
                    remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                    remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
training_dfm <- dfm_trim(training_dfm, min_docfreq= 0.05)

# get the test set (documents not in id_train) and compute the dfm out of it
test_corpus <- corpus_subset(myCorpusTwitter, !id_numeric %in% id_train)
summary(test_corpus)
test_dfm <- dfm(test_corpus, remove = c(stopwords("english"), ("amp"), ("rt"), ("tco"), ("co"),
                                        ("u"), ("t"), ("s"), ("ed"), ("https")),
                remove_punct = TRUE, remove_numbers=TRUE, tolower = TRUE, remove_symbols=TRUE,
                remove_twitter = TRUE, remove_separators=TRUE, remove_url = TRUE)
# Keep terms that appear in at least 5% of the tweets (tweets are very short texts...)
test_dfm <- dfm_trim(test_dfm, min_docfreq= 0.05)

# train the Naive Bayes classifier using textmodel_nb() and a multinomial distribution (the default)
nb <- textmodel_nb(training_dfm, docvars(training_dfm, "sentiment"), distribution = c("multinomial"))
summary(nb)
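
# A quick sketch: compare how many features the test dfm has before and after being aligned with the
# training features in the next step (object names as defined just above).
nfeat(test_dfm)                             # features in the test dfm before the alignment
nfeat(dfm_select(test_dfm, training_dfm))   # features after aligning with training_dfm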
# Let's make the features identical by passing training_dfm to dfm_select() as a pattern
test_dfm <- dfm_select(test_dfm, training_dfm)

# Let's inspect how well the classification worked
actual_class <- docvars(test_dfm, "sentiment")
predicted_class <- predict(nb, test_dfm)
table(predicted_class)
prop.table(table(predicted_class))
class_table <- table(actual_class, predicted_class)
class_table
confusionMatrix(class_table, mode = "everything")
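
# A closing sketch: collect the overall accuracy of the three classifiers from the 30%-training run
# in a single named vector for a quick comparison (conf.mat = random forest, conf.mat2 = SVM,
# class_table = Naive Bayes, as defined above).
acc_rf  <- unname(conf.mat$overall["Accuracy"])
acc_svm <- unname(conf.mat2$overall["Accuracy"])
acc_nb  <- sum(diag(class_table)) / sum(class_table)
round(c(RandomForest = acc_rf, SVM = acc_svm, NaiveBayes = acc_nb), 3)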