rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()

library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(quanteda)
library(readtext)
library(tm)       # iSA builds its document-term matrix via tm, not quanteda
library(iSAX)
library(ReadMe)

##################################################
# iSA
##################################################

### TRAINING-SET
# This dataset is a sample of tweets mentioning the official account of Donald
# Trump, "@realDonaldTrump", posted between 7 and 13 June 2016, written in
# English and coming from the US. Data were collected through the Twitter API,
# specifying the language and origin of the tweets. The dataset includes
# around 482 tweets that were manually coded by a group of students. The
# coding stage involved detecting the sentiment towards Trump (negative,
# positive, neutral).
x <- read.csv("Trump-orig3.csv", stringsAsFactors=FALSE)
str(x)
table(x$Sentiment)
x$Sentiment <- as.factor(x$Sentiment)
str(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))

### TEST-SET
# A sample of 1,000 tweets written in English about Trump, published from
# 17 to 19 January 2018.
x10 <- read.csv("Trump_tweets2.csv", stringsAsFactors=FALSE)
str(x10)
x10$X <- NULL
str(x10)

### Let's create a single dataset including both the TEST and the TRAINING SET
x10$Sentiment <- NA
str(x10)
documents <- rbind(x10, x)
str(documents)
table(documents$Sentiment)
prop.table(table(documents$Sentiment))

### iSA needs the tm package (not quanteda!) to build the DFM
corpus <- VCorpus(VectorSource(documents$text))
length(corpus)
str(corpus[[1]])
ocome <- prep.data(corpus, verbose=TRUE, th=0.995)
str(ocome)

### Separate the two data frames according to the presence or absence of the
### Sentiment label (i.e., training vs. test set)
train <- !is.na(documents$Sentiment)
summary(train)
table(documents$Sentiment)
table(train)
D <- documents$Sentiment[train]
prop.table(table(D))
str(D)
Strain <- ocome$S[which(train)]
Stest  <- ocome$S[-which(train)]

set.seed(123)
outSent <- iSA(Strain, Stest, D)
round(outSent$btab, 5)

# training-set %
prop.table(table(D))

### Remember the results I got from the NB model:
# negative  neutral positive
#    0.461    0.326    0.213
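# A minimal sketch (not in the original script): put the iSA category
# estimates side by side with the training-set proportions, for an eyeball
# comparison with the NB figures recalled above.
round(cbind(iSA      = outSent$btab[, "Estimate"],
            training = prop.table(table(D))), 3)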
##################################################
# ReadMe
##################################################
# Recover the DTM from the ocome object
str(ocome)
dtm  <- ocome$dtm
dtm1 <- data.frame(dtm)
str(dtm1)
documents$FILENAME <- 1:nrow(documents)
dtm1$sentiment <- as.factor(documents$Sentiment)
dtm1$FILENAME  <- as.factor(documents$FILENAME)
summary(dtm1$sentiment)

trainingset <- dtm1[which(!is.na(dtm1$sentiment)), ]
testset     <- dtm1[which(is.na(dtm1$sentiment)), ]
nrow(trainingset)
nrow(testset)
prop.table(table(trainingset$sentiment))

# ReadMe expects FILENAME, TRUTH and TRAININGSET as the first three columns,
# with the word features following
trainingset$TRAININGSET <- 1
testset$TRAININGSET <- 0
names(trainingset)[names(trainingset) == 'sentiment'] <- 'TRUTH'
names(testset)[names(testset) == 'sentiment'] <- 'TRUTH'
trainingset <- data.frame(TRAININGSET=trainingset$TRAININGSET, subset(trainingset, select=-TRAININGSET))
trainingset <- data.frame(TRUTH=trainingset$TRUTH, subset(trainingset, select=-TRUTH))
trainingset <- data.frame(FILENAME=trainingset$FILENAME, subset(trainingset, select=-FILENAME))
testset <- data.frame(TRAININGSET=testset$TRAININGSET, subset(testset, select=-TRAININGSET))
testset <- data.frame(TRUTH=testset$TRUTH, subset(testset, select=-TRUTH))
testset <- data.frame(FILENAME=testset$FILENAME, subset(testset, select=-FILENAME))
nrow(testset)
nrow(trainingset)
Out <- list(trainingset=trainingset, testset=testset)

# Wrapper around ReadMe's preprocess() and readme()
myRme <- function(trainset, testset, feat=NULL){
  ptm <- proc.time()
  cat("\nReadme...\n")
  require(ReadMe)
  testset$TRAININGSET <- 0
  Out <- list(trainingset=trainset, testset=testset)
  Out$features <- 16
  Out$n.subset <- 300
  Out$prob.wt  <- 1
  Out$boot.se  <- FALSE
  Out$nboot    <- 300
  Out$printit  <- TRUE
  n <- length(table(trainset$TRUTH))
  Out$formula <- as.formula(paste(paste(colnames(trainset)[-(1:3)], collapse="+"), "~ TRUTH"))
  Out <- preprocess(Out)
  tmp <- try(capture.output(readme(Out, features=feat) -> aa), TRUE)
  if(inherits(tmp, "try-error")) {
    return(list(est=matrix(NA, n, 1), time=NA))
  }
  tmp <- matrix(aa$est, length(aa$est), 1)
  rownames(tmp) <- names(aa$est)
  colnames(tmp) <- "ReadMe"
  etime <- (proc.time()-ptm)[1]
  cat(sprintf("\nElapsed time: %.2f seconds\n", etime))
  return(list(est=tmp, time=etime))
}

set.seed(123)
Rmefull <- myRme(Out$trainingset, Out$testset, feat=8)
str(Rmefull)
Rmefull$est
round(Rmefull$est, 5)
round(outSent$btab, 5)

##################################################
# How to assess the quality of iSA and ReadMe estimates?
##################################################
# Create two training sets from the training set you have, and estimate a MAE
# on them!

##################
# iSA: training set1 vs training set2
##################
# Create two training sets from the original one
set.seed(123)
split <- sample.split(Strain, SplitRatio=0.5)
table(split)
train1_ISA <- subset(Strain, split==TRUE)
train2_ISA <- subset(Strain, split==FALSE)
length(train1_ISA)
length(train2_ISA)
D_train1_ISA <- subset(D, split==TRUE)
prop.table(table(D_train1_ISA))
D_train2_ISA <- subset(D, split==FALSE)
prop.table(table(D_train2_ISA))

set.seed(123)
outSent <- iSA(train1_ISA, train2_ISA, D_train1_ISA)
str(outSent)
round(outSent$btab, 5)
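# A minimal sketch (not in the original script): the "test" half here is
# actually labelled, so the iSA aggregate estimates can be compared with the
# true set2 distribution directly.
round(cbind(iSA  = outSent$btab[, "Estimate"],
            true = prop.table(table(D_train2_ISA))), 3)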
##################
# ReadMe: training set1 vs training set2
##################
# Recover the DTM from the ocome object again
str(ocome)
dtm  <- ocome$dtm
dtm1 <- data.frame(dtm)
str(dtm1)
documents$FILENAME <- 1:nrow(documents)
dtm1$sentiment <- as.factor(documents$Sentiment)
dtm1$FILENAME  <- as.factor(documents$FILENAME)
summary(dtm1$sentiment)
trainingset2 <- dtm1[which(!is.na(dtm1$sentiment)), ]
testset2     <- dtm1[which(is.na(dtm1$sentiment)), ]
nrow(trainingset2)
nrow(testset2)

# Split the labelled rows with the same caTools split used for iSA
trainingset <- subset(trainingset2, split==TRUE)
testset     <- subset(trainingset2, split==FALSE)
prop.table(table(trainingset$sentiment))
prop.table(table(testset$sentiment))
trainingset$TRAININGSET <- 1
testset$TRAININGSET <- 0
names(trainingset)[names(trainingset) == 'sentiment'] <- 'TRUTH'
names(testset)[names(testset) == 'sentiment'] <- 'TRUTH'
trainingset <- data.frame(TRAININGSET=trainingset$TRAININGSET, subset(trainingset, select=-TRAININGSET))
trainingset <- data.frame(TRUTH=trainingset$TRUTH, subset(trainingset, select=-TRUTH))
trainingset <- data.frame(FILENAME=trainingset$FILENAME, subset(trainingset, select=-FILENAME))
testset <- data.frame(TRAININGSET=testset$TRAININGSET, subset(testset, select=-TRAININGSET))
testset <- data.frame(TRUTH=testset$TRUTH, subset(testset, select=-TRUTH))
testset <- data.frame(FILENAME=testset$FILENAME, subset(testset, select=-FILENAME))
nrow(testset)
nrow(trainingset)
Out <- list(trainingset=trainingset, testset=testset)

# Reuse the myRme() helper defined above
set.seed(123)
Rmefull <- myRme(Out$trainingset, Out$testset, feat=8)
str(Rmefull)
Rmefull$est
round(Rmefull$est, 5)

# To compare with iSA, add the actual values of the training set to the
# ReadMe estimates for the test set
x2 <- as.data.frame(table(D_train1_ISA))
str(x2)
colnames(x2)[2] <- "Real_Freq"
str(x2)
y2 <- as.data.frame(Rmefull$est)
str(y2)
# Convert ReadMe's estimated proportions into counts (236 documents in the
# held-out half)
x2$ReadMeFreq <- round(y2$ReadMe*236, 0)
x2
ReadMe <- x2
ReadMe$ReadMeFreqTot <- ReadMe$Real_Freq + x2$ReadMeFreq
prop.table(ReadMe$ReadMeFreqTot)

##################
# RandomForest: training set1 vs training set2
##################
dtm  <- ocome$dtm
dtm1 <- data.frame(dtm)
# This prepends a letter to column names that are plain numbers (if any
# numbers are left in the DTM). Highly suggested
colnames(dtm1) <- make.names(colnames(dtm1))
table(documents$Sentiment)
dtm1$sentiment <- as.factor(documents$Sentiment)
summary(dtm1$sentiment)
train <- dtm1[which(!is.na(dtm1$sentiment)), ]
test  <- dtm1[which(is.na(dtm1$sentiment)), ]
nrow(train)
nrow(test)
train1_RF <- subset(train, split==TRUE)
train2_RF <- subset(train, split==FALSE)
prop.table(table(train1_RF$sentiment))
prop.table(table(train2_RF$sentiment))

# Try a random forest model: set1 as training, set2 as test
set.seed(123)
tweetRF   <- randomForest(sentiment ~ ., data=train1_RF, type="classification")
predictRF <- predict(tweetRF, newdata=train2_RF)
prop.table(table(predictRF))

# To compare with iSA, add the actual values of the training set to the RF
# estimates for the test set
x2 <- table(train1_RF$sentiment)
y  <- table(predictRF)
RF <- x2 + y
prop.table(RF)
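# A minimal sketch (not in the original script): since set2 labels are known,
# we can also check the random forest at the individual level; caret's
# confusionMatrix() is available because caret is loaded above.
confusionMatrix(predictRF, train2_RF$sentiment)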
##################
# Naive Bayes: training set1 vs training set2
##################
str(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTwitter <- corpus(x)
summary(myCorpusTwitter)
str(myCorpusTwitter)

# Draw 236 document ids without replacement to treat them as the training set
set.seed(123)
id_train <- sample(1:472, 236, replace = FALSE)
head(id_train, 10)

# Create a docvar with the document ID
docvars(myCorpusTwitter, "id_numeric") <- 1:ndoc(myCorpusTwitter)
summary(myCorpusTwitter)

# Get the training set (documents in id_train) and compute its dfm
training_corpus1 <- corpus_subset(myCorpusTwitter, id_numeric %in% id_train)
table(training_corpus1$documents$Sentiment)
summary(training_corpus1)
training_dfm1 <- dfm(training_corpus1,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct=TRUE, remove_numbers=TRUE, tolower=TRUE,
                     remove_symbols=TRUE, remove_twitter=TRUE,
                     remove_separators=TRUE, remove_url=TRUE)
# Keep terms that appear in 5% or more of the tweets (tweets are very short
# texts...)
training_dfm1 <- dfm_trim(training_dfm1, min_docfreq=0.05)

# Get the test set (documents not in id_train) and compute its dfm
training_corpus2 <- corpus_subset(myCorpusTwitter, !id_numeric %in% id_train)
summary(training_corpus2)
table(training_corpus2$documents$Sentiment)
training_dfm2 <- dfm(training_corpus2,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct=TRUE, remove_numbers=TRUE, tolower=TRUE,
                     remove_symbols=TRUE, remove_twitter=TRUE,
                     remove_separators=TRUE, remove_url=TRUE)
# Keep terms that appear in 5% or more of the tweets
training_dfm2 <- dfm_trim(training_dfm2, min_docfreq=0.05)

# Train the naive Bayes classifier using textmodel_nb() with a multinomial
# distribution (the default)
nb <- textmodel_nb(training_dfm1, docvars(training_dfm1, "Sentiment"),
                   distribution = "multinomial")
summary(nb)

# Make the features identical by passing training_dfm1 to dfm_select() as a
# pattern
training_dfm2 <- dfm_select(training_dfm2, training_dfm1)

# Inspect how well the classification worked
actual_class    <- docvars(training_dfm2, "Sentiment")
predicted_class <- predict(nb, training_dfm2)
table(predicted_class)
prop.table(table(predicted_class))

# To compare with iSA, add the actual values of the training set to the Bayes
# estimates for the test set
x2 <- table(training_corpus1$documents$Sentiment)
y  <- table(predicted_class)
BAYES <- x2 + y
BAYES
prop.table(BAYES)
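# A minimal sketch (not in the original script): an individual-level check
# for the naive Bayes classifier too, tabulating predictions against the
# held-out labels recovered above.
confusionMatrix(table(predicted_class, actual_class))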
##################
# iSA: training set2 vs training set1
##################
set.seed(123)
outSent2 <- iSA(train2_ISA, train1_ISA, D_train2_ISA)
str(outSent2)
round(outSent2$btab, 5)

##################
# ReadMe: training set2 vs training set1
##################
# Swap the roles of the two halves
testset     <- subset(trainingset2, split==TRUE)
trainingset <- subset(trainingset2, split==FALSE)
prop.table(table(trainingset$sentiment))
prop.table(table(testset$sentiment))
trainingset$TRAININGSET <- 1
testset$TRAININGSET <- 0
names(trainingset)[names(trainingset) == 'sentiment'] <- 'TRUTH'
names(testset)[names(testset) == 'sentiment'] <- 'TRUTH'
trainingset <- data.frame(TRAININGSET=trainingset$TRAININGSET, subset(trainingset, select=-TRAININGSET))
trainingset <- data.frame(TRUTH=trainingset$TRUTH, subset(trainingset, select=-TRUTH))
trainingset <- data.frame(FILENAME=trainingset$FILENAME, subset(trainingset, select=-FILENAME))
testset <- data.frame(TRAININGSET=testset$TRAININGSET, subset(testset, select=-TRAININGSET))
testset <- data.frame(TRUTH=testset$TRUTH, subset(testset, select=-TRUTH))
testset <- data.frame(FILENAME=testset$FILENAME, subset(testset, select=-FILENAME))
nrow(testset)
nrow(trainingset)
Out <- list(trainingset=trainingset, testset=testset)

set.seed(123)
Rmefull2 <- myRme(Out$trainingset, Out$testset, feat=8)
str(Rmefull2)
Rmefull2$est

# To compare with iSA, add the actual values of the training set to the
# ReadMe estimates for the test set
x2 <- as.data.frame(table(D_train2_ISA))
str(x2)
colnames(x2)[2] <- "Real_Freq"
str(x2)
y2 <- as.data.frame(Rmefull2$est)
str(y2)
# Convert ReadMe's estimated proportions into counts (236 documents in the
# held-out half)
x2$ReadMeFreq <- round(y2$ReadMe*236, 0)
x2
ReadMe2 <- x2
ReadMe2$ReadMeFreqTot <- ReadMe2$Real_Freq + x2$ReadMeFreq
prop.table(ReadMe2$ReadMeFreqTot)

##################
# RandomForest: training set2 vs training set1
##################
set.seed(123)
tweetRF2   <- randomForest(sentiment ~ ., data=train2_RF, type="classification")
predictRF2 <- predict(tweetRF2, newdata=train1_RF)
prop.table(table(predictRF2))

# To compare with iSA, add the actual values of the training set to the RF
# estimates for the test set
x22 <- table(train2_RF$sentiment)
y2  <- table(predictRF2)
RF2 <- x22 + y2
prop.table(RF2)

##################
# Naive Bayes: training set2 vs training set1
##################
# Train the naive Bayes classifier using textmodel_nb() with a multinomial
# distribution (the default)
nb2 <- textmodel_nb(training_dfm2, docvars(training_dfm2, "Sentiment"),
                    distribution = "multinomial")
summary(nb2)

# Make the features identical by passing training_dfm2 to dfm_select() as a
# pattern
training_dfm1 <- dfm_select(training_dfm1, training_dfm2)

# Inspect how well the classification worked
actual_class2    <- docvars(training_dfm1, "Sentiment")
predicted_class2 <- predict(nb2, training_dfm1)
table(predicted_class2)
prop.table(table(predicted_class2))

# To compare with iSA, add the actual values of the training set to the Bayes
# estimates for the test set
x2 <- table(training_corpus2$documents$Sentiment)
y  <- table(predicted_class2)
BAYES2 <- x2 + y
BAYES2
prop.table(BAYES2)

##################
# Estimating the MAE: training set1 vs training set2
##################
mat_all <- cbind(outSent$btab[,"Estimate"],
                 prop.table(ReadMe$ReadMeFreqTot),
                 prop.table(RF),
                 prop.table(BAYES),
                 prop.table(table(c(D_train1_ISA, D_train2_ISA))))
colnames(mat_all) <- c("iSA", "ReadMe", "RF", "BAYES", "True all")
round(mat_all, 3)
mae_ISA   <- mean(abs(mat_all[,1]-mat_all[,5]))
mae_RM    <- mean(abs(mat_all[,2]-mat_all[,5]))
mae_RF    <- mean(abs(mat_all[,3]-mat_all[,5]))
mae_BAYES <- mean(abs(mat_all[,4]-mat_all[,5]))
mae_ISA
mae_RM
mae_RF
mae_BAYES

##################
# Estimating the MAE: training set2 vs training set1
##################
mat_all2 <- cbind(outSent2$btab[,"Estimate"],
                  prop.table(ReadMe2$ReadMeFreqTot),
                  prop.table(RF2),
                  prop.table(BAYES2),
                  prop.table(table(c(D_train1_ISA, D_train2_ISA))))
colnames(mat_all2) <- c("iSA", "ReadMe", "RF", "BAYES", "True all")
round(mat_all2, 3)
mae_ISA2   <- mean(abs(mat_all2[,1]-mat_all2[,5]))
mae_RM2    <- mean(abs(mat_all2[,2]-mat_all2[,5]))
mae_RF2    <- mean(abs(mat_all2[,3]-mat_all2[,5]))
mae_BAYES2 <- mean(abs(mat_all2[,4]-mat_all2[,5]))
mae_ISA2
mae_RM2
mae_RF2
mae_BAYES2

####### Avg. MAE
(mae_ISA + mae_ISA2)/2
(mae_RM + mae_RM2)/2
(mae_RF + mae_RF2)/2
(mae_BAYES + mae_BAYES2)/2
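# A minimal sketch (not in the original script): collect the four average
# MAEs into one named vector and rank the methods; lower is better.
avg_mae <- c(iSA    = (mae_ISA   + mae_ISA2)/2,
             ReadMe = (mae_RM    + mae_RM2)/2,
             RF     = (mae_RF    + mae_RF2)/2,
             BAYES  = (mae_BAYES + mae_BAYES2)/2)
round(sort(avg_mae), 4)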