rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()

library(e1071)
library(caTools)
library(randomForest)
library(caret)
library(quanteda)
library(readtext)
library(tm)       # iSA builds its document-term matrix via tm, not quanteda
library(iSAX)
library(ReadMe)

##################################################
# iSA
##################################################

### TRAINING-SET
# This dataset is a sample of tweets mentioning the official account of Donald
# Trump, "@realDonaldTrump", posted between 7 and 13 June 2016, written in
# English and coming from the US. Data were collected through the Twitter API,
# specifying the language and origin of the tweets. The dataset includes
# around 482 tweets that were manually coded by a group of students. The
# coding stage involved detecting the sentiment towards Trump (negative,
# positive, neutral).
x <- read.csv("Trump-orig3.csv", stringsAsFactors=FALSE)
str(x)
table(x$Sentiment)
x$Sentiment <- as.factor(x$Sentiment)
str(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))

### TEST-SET
# A sample of 1,000 tweets written in English about Trump, published from
# 17 to 19 January 2018.
x10 <- read.csv("Trump_tweets2.csv", stringsAsFactors=FALSE)
str(x10)
x10$X <- NULL
str(x10)

### Let's create a single dataset including both the TEST and the TRAINING SET
x10$Sentiment <- NA
str(x10)
documents <- rbind(x10, x)
str(documents)
table(documents$Sentiment)
prop.table(table(documents$Sentiment))

### iSA needs the tm package (not quanteda!) to build the DFM
corpus <- VCorpus(VectorSource(documents$text))
length(corpus)
str(corpus[[1]])
ocome <- prep.data(corpus, verbose=TRUE, th=0.995)
str(ocome)

### Separate the two data frames according to the presence or absence of the
### Sentiment label (i.e., training vs. test set)
train <- !is.na(documents$Sentiment)
summary(train)
table(documents$Sentiment)
table(train)
D <- documents$Sentiment[train]
prop.table(table(D))
str(D)
Strain <- ocome$S[which(train)]
Stest  <- ocome$S[-which(train)]

set.seed(123)
outSent <- iSA(Strain, Stest, D)
round(outSent$btab, 5)

# training-set %
prop.table(table(D))

### Remember the results I got from the NB model:
# negative  neutral positive
#    0.461    0.326    0.213
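# A minimal sketch (not in the original script): put the iSA category
# estimates side by side with the training-set proportions, for an eyeball
# comparison with the NB figures recalled above.
round(cbind(iSA      = outSent$btab[, "Estimate"],
            training = prop.table(table(D))), 3)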
##################################################
# ReadMe
##################################################
# Recover the DTM from the ocome object
str(ocome)
dtm  <- ocome$dtm
dtm1 <- data.frame(dtm)
str(dtm1)
documents$FILENAME <- 1:nrow(documents)
dtm1$sentiment <- as.factor(documents$Sentiment)
dtm1$FILENAME  <- as.factor(documents$FILENAME)
summary(dtm1$sentiment)

trainingset <- dtm1[which(!is.na(dtm1$sentiment)), ]
testset     <- dtm1[which(is.na(dtm1$sentiment)), ]
nrow(trainingset)
nrow(testset)
prop.table(table(trainingset$sentiment))

# ReadMe expects FILENAME, TRUTH and TRAININGSET as the first three columns,
# with the word features following
trainingset$TRAININGSET <- 1
testset$TRAININGSET <- 0
names(trainingset)[names(trainingset) == 'sentiment'] <- 'TRUTH'
names(testset)[names(testset) == 'sentiment'] <- 'TRUTH'
trainingset <- data.frame(TRAININGSET=trainingset$TRAININGSET, subset(trainingset, select=-TRAININGSET))
trainingset <- data.frame(TRUTH=trainingset$TRUTH, subset(trainingset, select=-TRUTH))
trainingset <- data.frame(FILENAME=trainingset$FILENAME, subset(trainingset, select=-FILENAME))
testset <- data.frame(TRAININGSET=testset$TRAININGSET, subset(testset, select=-TRAININGSET))
testset <- data.frame(TRUTH=testset$TRUTH, subset(testset, select=-TRUTH))
testset <- data.frame(FILENAME=testset$FILENAME, subset(testset, select=-FILENAME))
nrow(testset)
nrow(trainingset)
Out <- list(trainingset=trainingset, testset=testset)

# Wrapper around ReadMe's preprocess() and readme()
myRme <- function(trainset, testset, feat=NULL){
  ptm <- proc.time()
  cat("\nReadme...\n")
  require(ReadMe)
  testset$TRAININGSET <- 0
  Out <- list(trainingset=trainset, testset=testset)
  Out$features <- 16
  Out$n.subset <- 300
  Out$prob.wt  <- 1
  Out$boot.se  <- FALSE
  Out$nboot    <- 300
  Out$printit  <- TRUE
  n <- length(table(trainset$TRUTH))
  Out$formula <- as.formula(paste(paste(colnames(trainset)[-(1:3)], collapse="+"), "~ TRUTH"))
  Out <- preprocess(Out)
  tmp <- try(capture.output(readme(Out, features=feat) -> aa), TRUE)
  if(inherits(tmp, "try-error")) {
    return(list(est=matrix(NA, n, 1), time=NA))
  }
  tmp <- matrix(aa$est, length(aa$est), 1)
  rownames(tmp) <- names(aa$est)
  colnames(tmp) <- "ReadMe"
  etime <- (proc.time()-ptm)[1]
  cat(sprintf("\nElapsed time: %.2f seconds\n", etime))
  return(list(est=tmp, time=etime))
}

set.seed(123)
Rmefull <- myRme(Out$trainingset, Out$testset, feat=8)
str(Rmefull)
Rmefull$est
round(Rmefull$est, 5)
round(outSent$btab, 5)

##################################################
# How to assess the quality of iSA and ReadMe estimates?
##################################################
# Create two training sets from the training set you have, and estimate a MAE
# on them!

##################
# iSA: training set1 vs training set2
##################
# Create two training sets from the original one
set.seed(123)
split <- sample.split(Strain, SplitRatio=0.5)
table(split)
train1_ISA <- subset(Strain, split==TRUE)
train2_ISA <- subset(Strain, split==FALSE)
length(train1_ISA)
length(train2_ISA)
D_train1_ISA <- subset(D, split==TRUE)
prop.table(table(D_train1_ISA))
D_train2_ISA <- subset(D, split==FALSE)
prop.table(table(D_train2_ISA))

set.seed(123)
outSent <- iSA(train1_ISA, train2_ISA, D_train1_ISA)
str(outSent)
round(outSent$btab, 5)
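# A minimal sketch (not in the original script): the "test" half here is
# actually labelled, so the iSA aggregate estimates can be compared with the
# true set2 distribution directly.
round(cbind(iSA  = outSent$btab[, "Estimate"],
            true = prop.table(table(D_train2_ISA))), 3)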
##################
# ReadMe: training set1 vs training set2
##################
# Recover the DTM from the ocome object again
str(ocome)
dtm  <- ocome$dtm
dtm1 <- data.frame(dtm)
str(dtm1)
documents$FILENAME <- 1:nrow(documents)
dtm1$sentiment <- as.factor(documents$Sentiment)
dtm1$FILENAME  <- as.factor(documents$FILENAME)
summary(dtm1$sentiment)
trainingset2 <- dtm1[which(!is.na(dtm1$sentiment)), ]
testset2     <- dtm1[which(is.na(dtm1$sentiment)), ]
nrow(trainingset2)
nrow(testset2)

# Split the labelled rows with the same caTools split used for iSA
trainingset <- subset(trainingset2, split==TRUE)
testset     <- subset(trainingset2, split==FALSE)
prop.table(table(trainingset$sentiment))
prop.table(table(testset$sentiment))
trainingset$TRAININGSET <- 1
testset$TRAININGSET <- 0
names(trainingset)[names(trainingset) == 'sentiment'] <- 'TRUTH'
names(testset)[names(testset) == 'sentiment'] <- 'TRUTH'
trainingset <- data.frame(TRAININGSET=trainingset$TRAININGSET, subset(trainingset, select=-TRAININGSET))
trainingset <- data.frame(TRUTH=trainingset$TRUTH, subset(trainingset, select=-TRUTH))
trainingset <- data.frame(FILENAME=trainingset$FILENAME, subset(trainingset, select=-FILENAME))
testset <- data.frame(TRAININGSET=testset$TRAININGSET, subset(testset, select=-TRAININGSET))
testset <- data.frame(TRUTH=testset$TRUTH, subset(testset, select=-TRUTH))
testset <- data.frame(FILENAME=testset$FILENAME, subset(testset, select=-FILENAME))
nrow(testset)
nrow(trainingset)
Out <- list(trainingset=trainingset, testset=testset)

# Reuse the myRme() helper defined above
set.seed(123)
Rmefull <- myRme(Out$trainingset, Out$testset, feat=8)
str(Rmefull)
Rmefull$est
round(Rmefull$est, 5)

# To compare with iSA, add the actual values of the training set to the
# ReadMe estimates for the test set
x2 <- as.data.frame(table(D_train1_ISA))
str(x2)
colnames(x2)[2] <- "Real_Freq"
str(x2)
y2 <- as.data.frame(Rmefull$est)
str(y2)
# Convert ReadMe's estimated proportions into counts (236 documents in the
# held-out half)
x2$ReadMeFreq <- round(y2$ReadMe*236, 0)
x2
ReadMe <- x2
ReadMe$ReadMeFreqTot <- ReadMe$Real_Freq + x2$ReadMeFreq
prop.table(ReadMe$ReadMeFreqTot)

##################
# RandomForest: training set1 vs training set2
##################
dtm  <- ocome$dtm
dtm1 <- data.frame(dtm)
# This prepends a letter to column names that are plain numbers (if any
# numbers are left in the DTM). Highly suggested
colnames(dtm1) <- make.names(colnames(dtm1))
table(documents$Sentiment)
dtm1$sentiment <- as.factor(documents$Sentiment)
summary(dtm1$sentiment)
train <- dtm1[which(!is.na(dtm1$sentiment)), ]
test  <- dtm1[which(is.na(dtm1$sentiment)), ]
nrow(train)
nrow(test)
train1_RF <- subset(train, split==TRUE)
train2_RF <- subset(train, split==FALSE)
prop.table(table(train1_RF$sentiment))
prop.table(table(train2_RF$sentiment))

# Try a random forest model: set1 as training, set2 as test
set.seed(123)
tweetRF   <- randomForest(sentiment ~ ., data=train1_RF, type="classification")
predictRF <- predict(tweetRF, newdata=train2_RF)
prop.table(table(predictRF))

# To compare with iSA, add the actual values of the training set to the RF
# estimates for the test set
x2 <- table(train1_RF$sentiment)
y  <- table(predictRF)
RF <- x2 + y
prop.table(RF)
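# A minimal sketch (not in the original script): since set2 labels are known,
# we can also check the random forest at the individual level; caret's
# confusionMatrix() is available because caret is loaded above.
confusionMatrix(predictRF, train2_RF$sentiment)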
##################
# Naive Bayes: training set1 vs training set2
##################
str(x)
table(x$Sentiment)
prop.table(table(x$Sentiment))
myCorpusTwitter <- corpus(x)
summary(myCorpusTwitter)
str(myCorpusTwitter)

# Draw 236 document ids without replacement to treat them as the training set
set.seed(123)
id_train <- sample(1:472, 236, replace = FALSE)
head(id_train, 10)

# Create a docvar with the document ID
docvars(myCorpusTwitter, "id_numeric") <- 1:ndoc(myCorpusTwitter)
summary(myCorpusTwitter)

# Get the training set (documents in id_train) and compute its dfm
training_corpus1 <- corpus_subset(myCorpusTwitter, id_numeric %in% id_train)
table(training_corpus1$documents$Sentiment)
summary(training_corpus1)
training_dfm1 <- dfm(training_corpus1,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct=TRUE, remove_numbers=TRUE, tolower=TRUE,
                     remove_symbols=TRUE, remove_twitter=TRUE,
                     remove_separators=TRUE, remove_url=TRUE)
# Keep terms that appear in 5% or more of the tweets (tweets are very short
# texts...)
training_dfm1 <- dfm_trim(training_dfm1, min_docfreq=0.05)

# Get the test set (documents not in id_train) and compute its dfm
training_corpus2 <- corpus_subset(myCorpusTwitter, !id_numeric %in% id_train)
summary(training_corpus2)
table(training_corpus2$documents$Sentiment)
training_dfm2 <- dfm(training_corpus2,
                     remove = c(stopwords("english"), "amp", "rt", "tco", "co",
                                "u", "t", "s", "ed", "https"),
                     remove_punct=TRUE, remove_numbers=TRUE, tolower=TRUE,
                     remove_symbols=TRUE, remove_twitter=TRUE,
                     remove_separators=TRUE, remove_url=TRUE)
# Keep terms that appear in 5% or more of the tweets
training_dfm2 <- dfm_trim(training_dfm2, min_docfreq=0.05)

# Train the naive Bayes classifier using textmodel_nb() with a multinomial
# distribution (the default)
nb <- textmodel_nb(training_dfm1, docvars(training_dfm1, "Sentiment"),
                   distribution = "multinomial")
summary(nb)

# Make the features identical by passing training_dfm1 to dfm_select() as a
# pattern
training_dfm2 <- dfm_select(training_dfm2, training_dfm1)

# Inspect how well the classification worked
actual_class    <- docvars(training_dfm2, "Sentiment")
predicted_class <- predict(nb, training_dfm2)
table(predicted_class)
prop.table(table(predicted_class))

# To compare with iSA, add the actual values of the training set to the Bayes
# estimates for the test set
x2 <- table(training_corpus1$documents$Sentiment)
y  <- table(predicted_class)
BAYES <- x2 + y
BAYES
prop.table(BAYES)
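# A minimal sketch (not in the original script): an individual-level check
# for the naive Bayes classifier too, tabulating predictions against the
# held-out labels recovered above.
confusionMatrix(table(predicted_class, actual_class))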
##################
# iSA: training set2 vs training set1
##################
set.seed(123)
outSent2 <- iSA(train2_ISA, train1_ISA, D_train2_ISA)
str(outSent2)
round(outSent2$btab, 5)

##################
# ReadMe: training set2 vs training set1
##################
# Swap the roles of the two halves
testset     <- subset(trainingset2, split==TRUE)
trainingset <- subset(trainingset2, split==FALSE)
prop.table(table(trainingset$sentiment))
prop.table(table(testset$sentiment))
trainingset$TRAININGSET <- 1
testset$TRAININGSET <- 0
names(trainingset)[names(trainingset) == 'sentiment'] <- 'TRUTH'
names(testset)[names(testset) == 'sentiment'] <- 'TRUTH'
trainingset <- data.frame(TRAININGSET=trainingset$TRAININGSET, subset(trainingset, select=-TRAININGSET))
trainingset <- data.frame(TRUTH=trainingset$TRUTH, subset(trainingset, select=-TRUTH))
trainingset <- data.frame(FILENAME=trainingset$FILENAME, subset(trainingset, select=-FILENAME))
testset <- data.frame(TRAININGSET=testset$TRAININGSET, subset(testset, select=-TRAININGSET))
testset <- data.frame(TRUTH=testset$TRUTH, subset(testset, select=-TRUTH))
testset <- data.frame(FILENAME=testset$FILENAME, subset(testset, select=-FILENAME))
nrow(testset)
nrow(trainingset)
Out <- list(trainingset=trainingset, testset=testset)

set.seed(123)
Rmefull2 <- myRme(Out$trainingset, Out$testset, feat=8)
str(Rmefull2)
Rmefull2$est

# To compare with iSA, add the actual values of the training set to the
# ReadMe estimates for the test set
x2 <- as.data.frame(table(D_train2_ISA))
str(x2)
colnames(x2)[2] <- "Real_Freq"
str(x2)
y2 <- as.data.frame(Rmefull2$est)
str(y2)
# Convert ReadMe's estimated proportions into counts (236 documents in the
# held-out half)
x2$ReadMeFreq <- round(y2$ReadMe*236, 0)
x2
ReadMe2 <- x2
ReadMe2$ReadMeFreqTot <- ReadMe2$Real_Freq + x2$ReadMeFreq
prop.table(ReadMe2$ReadMeFreqTot)

##################
# RandomForest: training set2 vs training set1
##################
set.seed(123)
tweetRF2   <- randomForest(sentiment ~ ., data=train2_RF, type="classification")
predictRF2 <- predict(tweetRF2, newdata=train1_RF)
prop.table(table(predictRF2))

# To compare with iSA, add the actual values of the training set to the RF
# estimates for the test set
x22 <- table(train2_RF$sentiment)
y2  <- table(predictRF2)
RF2 <- x22 + y2
prop.table(RF2)

##################
# Naive Bayes: training set2 vs training set1
##################
# Train the naive Bayes classifier using textmodel_nb() with a multinomial
# distribution (the default)
nb2 <- textmodel_nb(training_dfm2, docvars(training_dfm2, "Sentiment"),
                    distribution = "multinomial")
summary(nb2)

# Make the features identical by passing training_dfm2 to dfm_select() as a
# pattern
training_dfm1 <- dfm_select(training_dfm1, training_dfm2)

# Inspect how well the classification worked
actual_class2    <- docvars(training_dfm1, "Sentiment")
predicted_class2 <- predict(nb2, training_dfm1)
table(predicted_class2)
prop.table(table(predicted_class2))

# To compare with iSA, add the actual values of the training set to the Bayes
# estimates for the test set
x2 <- table(training_corpus2$documents$Sentiment)
y  <- table(predicted_class2)
BAYES2 <- x2 + y
BAYES2
prop.table(BAYES2)

##################
# Estimating the MAE: training set1 vs training set2
##################
mat_all <- cbind(outSent$btab[,"Estimate"],
                 prop.table(ReadMe$ReadMeFreqTot),
                 prop.table(RF),
                 prop.table(BAYES),
                 prop.table(table(c(D_train1_ISA, D_train2_ISA))))
colnames(mat_all) <- c("iSA", "ReadMe", "RF", "BAYES", "True all")
round(mat_all, 3)
mae_ISA   <- mean(abs(mat_all[,1]-mat_all[,5]))
mae_RM    <- mean(abs(mat_all[,2]-mat_all[,5]))
mae_RF    <- mean(abs(mat_all[,3]-mat_all[,5]))
mae_BAYES <- mean(abs(mat_all[,4]-mat_all[,5]))
mae_ISA
mae_RM
mae_RF
mae_BAYES

##################
# Estimating the MAE: training set2 vs training set1
##################
mat_all2 <- cbind(outSent2$btab[,"Estimate"],
                  prop.table(ReadMe2$ReadMeFreqTot),
                  prop.table(RF2),
                  prop.table(BAYES2),
                  prop.table(table(c(D_train1_ISA, D_train2_ISA))))
colnames(mat_all2) <- c("iSA", "ReadMe", "RF", "BAYES", "True all")
round(mat_all2, 3)
mae_ISA2   <- mean(abs(mat_all2[,1]-mat_all2[,5]))
mae_RM2    <- mean(abs(mat_all2[,2]-mat_all2[,5]))
mae_RF2    <- mean(abs(mat_all2[,3]-mat_all2[,5]))
mae_BAYES2 <- mean(abs(mat_all2[,4]-mat_all2[,5]))
mae_ISA2
mae_RM2
mae_RF2
mae_BAYES2

####### Avg. MAE
(mae_ISA + mae_ISA2)/2
(mae_RM + mae_RM2)/2
(mae_RF + mae_RF2)/2
(mae_BAYES + mae_BAYES2)/2
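# A minimal sketch (not in the original script): collect the four average
# MAEs into one named vector and rank the methods; lower is better.
avg_mae <- c(iSA    = (mae_ISA   + mae_ISA2)/2,
             ReadMe = (mae_RM    + mae_RM2)/2,
             RF     = (mae_RF    + mae_RF2)/2,
             BAYES  = (mae_BAYES + mae_BAYES2)/2)
round(sort(avg_mae), 4)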