rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(e1071)
library(randomForest)
library(caret)
library(iSAX)
library(tm)

######################################################
######################################################
# Let's learn how to estimate a Naive Bayes model
######################################################
######################################################

#####################################################
# FIRST STEP: let's create the DfM for the training-set
#####################################################
# This is a sample of 500 tweets written in English about Trump and published between 1.17.2018 and 1.19.2018.
# Each tweet has been coded as expressing a positive/negative/neutral sentiment towards Donald Trump.
# We treat these tweets as our training-set.
x11 <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
str(x11)
table(x11$Sentiment)
prop.table(table(x11$Sentiment))

# Let's do some text preprocessing directly at this stage
library(stringr)
kwic(x11$text, "Trump's")
# several tweets include the word "Trump's", while we want to keep just "Trump"
x11$text <- str_replace_all(x11$text, "[^[:alnum:]]", " ")
kwic(x11$text, "Trump's")
# issue solved!

myCorpusTwitterTrain <- corpus(x11)
head(summary(myCorpusTwitterTrain))
Dfm_train <- dfm(myCorpusTwitterTrain, remove = c(stopwords("english")),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_twitter = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
topfeatures(Dfm_train, 20)  # 20 top words
# some problems here. Let's clean the DfM
Dfm_train <- dfm(myCorpusTwitterTrain,
                 remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                            "s", "ed", "https", "â", "com", "ly"),
                 remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                 remove_symbols = TRUE, remove_twitter = TRUE,
                 remove_separators = TRUE, remove_url = TRUE)
topfeatures(Dfm_train, 20)  # 20 top words
# Let's trim the DfM in order to keep only tokens that appear in 2 or more tweets (tweets are very short texts...)
Dfm_train <- dfm_trim(Dfm_train, min_docfreq = 2, verbose = TRUE)

#####################################################
# SECOND STEP: let's create the DfM for the test-set
#####################################################
# This is a sample of 500 tweets written in English about Trump and published between 1.17.2018 and 1.19.2018
# that we treat as our test-set.
x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
str(x10)
# removing all punctuation also here
kwic(x10$text, "Trump's")
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")
kwic(x10$text, "Trump's")

myCorpusTwitterTest <- corpus(x10)
head(summary(myCorpusTwitterTest))
Dfm_test <- dfm(myCorpusTwitterTest,
                remove = c(stopwords("english"), "amp", "rt", "tco", "co", "u", "t",
                           "s", "ed", "https", "â", "com", "ly"),
                remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
                remove_symbols = TRUE, remove_twitter = TRUE,
                remove_separators = TRUE, remove_url = TRUE)
topfeatures(Dfm_test, 20)  # 20 top words
Dfm_test <- dfm_trim(Dfm_test, min_docfreq = 2, verbose = TRUE)
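# Optional check (a minimal sketch, not in the original script): how many features do the
# training and test DfMs have in common before any matching? This motivates the
# dfm_match() step that follows.
length(intersect(featnames(Dfm_train), featnames(Dfm_test)))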
#####################################################
# THIRD STEP: Let's make the features identical between training and test-set by passing Dfm_train
# to dfm_match() as a pattern.
# This is needed because Naive Bayes in quanteda can only take into consideration features that occur
# both in the training-set and in the test-set
#####################################################
str(Dfm_train)
length(Dfm_train@Dimnames$features)  # 939 features
str(Dfm_test)
length(Dfm_test@Dimnames$features)   # 912 features
test_dfm <- dfm_match(Dfm_test, features = featnames(Dfm_train))
length(test_dfm@Dimnames$features)   # 939 features

#####################################################
# FOURTH STEP: Let's run a Naive Bayes model in quanteda [at the moment, arguably the best way
# to run a Naive Bayes model on texts]
#####################################################
# a) train the Naive Bayes classifier using textmodel_nb() and a multinomial distribution (the default)
head(docvars(Dfm_train))
system.time(nb <- textmodel_nb(Dfm_train, docvars(Dfm_train, "Sentiment"), distribution = c("multinomial")))  # very fast!
summary(nb)
# b) predict the test-set
predicted_nb <- predict(nb, test_dfm)
table(predicted_nb)
prop.table(table(predicted_nb))
# compare with the % in the training-set
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Let's learn how to estimate a Random Forest model
######################################################
######################################################
# KEEP steps 1-3 from above (if you start from scratch, you should repeat them!)

#####################################################
# FOURTH STEP/B: transform both DfMs (training and test-set) into a data frame
#####################################################
train <- as.data.frame(as.matrix(Dfm_train))
test <- as.data.frame(as.matrix(test_dfm))
# this is important: randomForest cannot handle column names that begin with a space, comma, number
# or other special characters.
# the command below adds a letter in front of names that start with a number (if any numbers are left
# in the DfM). Highly recommended
colnames(train) <- make.names(colnames(train))
colnames(test) <- make.names(colnames(test))

#####################################################
# FIFTH STEP/B: let's run the Random Forest
#####################################################
# a) train the RF classifier (define a set.seed to be able to replicate the results!)
# The main default hyperparameters of a RF are the following:
# - ntree=500: number of trees to grow (here we set ntree=100 to save time)
# - mtry=sqrt(p): number of variables randomly sampled as candidates at each split, where p is the
#   number of variables in x; in our case p=length(train)=939, therefore sqrt(length(train))=30.64
# - nodesize=1: minimum size of terminal nodes (setting this number larger causes smaller trees to be
#   grown, and thus takes less time)
# Note that your output variable should be a factor or a numeric value, not a character! Since I want
# to run a classification model, I therefore transform the Sentiment variable into a factor via as.factor.
str(Dfm_train@docvars$Sentiment)  # that's a character variable! Not good!
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(RF <- randomForest(y = as.factor(Dfm_train@docvars$Sentiment), x = train,
                               importance = TRUE, ntree = 100, do.trace = TRUE))
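# Optional check (a minimal sketch, not in the original script): for a classification forest,
# print(RF) reports the out-of-bag (OOB) error estimate and the OOB confusion matrix
# computed on the training data.
print(RF)
RF$confusion  # OOB confusion matrix (rows = observed class, columns = predicted class)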
# The graph below shows what happens to the predictive power of your model if you drop some variables.
# The variables with the highest importance scores are the ones that give the best prediction and contribute most to the model.
# On the left panel: the mean decrease in accuracy if you drop (permute) that variable
# On the right panel: a related measure, the mean decrease in node impurity (Gini) from the splits on that variable
varImpPlot(RF)
# b) predict the test-set
system.time(predicted_rf <- predict(RF, test, type = "class"))
table(predicted_rf)
prop.table(table(predicted_rf))
# compare with the % in the training-set
prop.table(table(Dfm_train@docvars$Sentiment))

######################################################
######################################################
# Let's learn how to estimate a SVM model
######################################################
######################################################
# KEEP steps 1-3 from above (if you start from scratch, you should repeat them!)
# KEEP also the FOURTH STEP/B from above (if you start from scratch, you should repeat it!)

#####################################################
# FIFTH STEP/C: let's run a SVM model
#####################################################
# a) train the SVM classifier
# note that here I select a linear kernel and, as hyperparameter, a specific value for the cost C (=1)
# other main hyperparameters for a linear kernel are: epsilon (the epsilon in the insensitive-loss function; default: 0.1)
# for all other kernels (radial, polynomial) you also have gamma (default: 1/(data dimension))
# for the polynomial kernel you also have degree (default: 3) and coef0 (default: 0)
set.seed(123)  # define a set.seed to be able to replicate the results!
system.time(SV <- svm(y = as.factor(Dfm_train@docvars$Sentiment), x = train, kernel = 'linear', cost = 1))
# how many support vectors?
length(SV$index)  # 373 texts out of 500 documents
nrow(train)       # 500 texts in the train data frame
# these are the indices of the support vectors
SV$index
# these are the first 6 support vectors with the corresponding values for each feature included in them
head(SV$SV)
# why do you get such strange values? Because by default svm() rescales all values to zero mean and unit variance.
# The center and scale values are returned and used for later predictions.
# and indeed, if you add scale=FALSE to the estimation you get the original feature values back
# (but do not do that in your own analysis!)
SV2 <- svm(y = as.factor(Dfm_train@docvars$Sentiment), x = train, kernel = 'linear', cost = 1, scale = FALSE)
head(SV2$SV)
# let's read those observations (i.e., documents) that are more "important", i.e. that best "separate" the data
str(x11)
vectors <- x11[SV$index, ]
head(x11$text)
head(vectors$text)
# you can see that, for example, documents [3] to [6] in the training-set are not support vectors
# b) predict the test-set
system.time(predicted_svm <- predict(SV, test))
table(predicted_svm)
prop.table(table(predicted_svm))
# compare with the % in the training-set
prop.table(table(Dfm_train@docvars$Sentiment))

#####################################################
# Let's compare the three results we got with NB, RF and SVM:
#####################################################
prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
prop.table(table(predicted_svm))
# there is some difference! Therefore, which one to "trust" more?
# the ANSWER: do a Cross-Validation!!!!
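# A minimal cross-validation sketch (not in the original script; the number of folds, the method
# and the object names are illustrative). It uses caret (loaded above) to estimate held-out
# accuracy on the training-set; here it is shown for a linear SVM via method = "svmLinear"
# (which assumes the kernlab package is installed), but the same trainControl can be reused
# with other methods to compare the classifiers.
ctrl <- trainControl(method = "cv", number = 5)  # 5-fold cross-validation
set.seed(123)
cv_svm <- caret::train(x = train, y = as.factor(Dfm_train@docvars$Sentiment),
                       method = "svmLinear", trControl = ctrl)
cv_svm$results  # average held-out accuracy and Kappa across the 5 folds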
######################################################
######################################################
# Let's learn how to estimate a proportional model via iSAX
######################################################
######################################################

### STEP 1/D: TRAINING-SET
# Let's start with our usual training-set about Trump
x <- read.csv("trainTrump.csv", stringsAsFactors=FALSE)
x$text <- str_replace_all(x$text, "[^[:alnum:]]", " ")
x$Sentiment <- as.factor(x$Sentiment)  # let's transform the variable "Sentiment" into a factor variable
prop.table(table(x$Sentiment))

### STEP 2/D: TEST-SET
### That's our usual test-set
x10 <- read.csv("testTrump.csv", stringsAsFactors=FALSE)
x10$text <- str_replace_all(x10$text, "[^[:alnum:]]", " ")

### STEP 3/D: Let's create a unique dataset including both TEST and TRAINING SET
x10$Sentiment <- NA  # to do that, let's add a column called "Sentiment" to the test-set, given that such a column is present in the training-set
documents <- rbind(x10, x)  # let's combine the test-set and the training-set
str(documents)  # we have 1000 texts (500 as test-set and 500 as training-set)
prop.table(table(documents$Sentiment))

### STEP 4/D: iSA needs the tm package (not quanteda!) to build the DfM
corpus <- VCorpus(VectorSource(documents$text))  # let's build the corpus
str(corpus[[1]])
ocome <- prep.data(corpus, verbose=TRUE, th=0.995)  # let's prepare the data for the iSA algorithm.
# This is a pre-processing step which performs stemming and other cleaning steps, as well as producing the DfM.
# th=0.995 is a sparsity threshold: we drop those features that appear in less than 0.5% of the texts
# (here, fewer than 5 tweets out of 1000)

### STEP 5/D: let's separate the resulting object "ocome" according to the presence or absence of info
### about the Sentiment (i.e., training vs. test-set)
train <- !is.na(documents$Sentiment)  # I create an index that is TRUE for the training-set documents
                                      # (i.e., those texts whose Sentiment is not NA)
train
summary(train)
D <- documents$Sentiment[train]  # I recover the vector of Sentiment values for the training-set
str(D)
# Same results indeed!
prop.table(table(D))
prop.table(table(x$Sentiment))
Strain <- ocome$S[which(train)]   # I select out of "ocome" the vector of stems belonging to the training-set
Stest <- ocome$S[-which(train)]   # I select out of "ocome" the vector of stems belonging to the test-set
length(Strain)  # 500!
length(Stest)   # 500!

### STEP 6/D: let's run the proportional algorithm
set.seed(123)
system.time(outSent <- iSA(Strain, Stest, D))  # D is the vector of codings belonging to the training-set

### STEP 7/D: let's classify the test-set
round(outSent$btab, 5)  # we also get bootstrapped standard errors!

#####################################################
# Let's compare the results we got with NB, RF, SVM and iSA:
#####################################################
prop.table(table(predicted_nb))
prop.table(table(predicted_rf))
prop.table(table(predicted_svm))
outSent$btab[1:3]
# there is some difference! Therefore, which one to trust more?
# once again the ANSWER: do a Cross-Validation!!!!
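# A minimal comparison sketch (not in the original script): put the estimated category proportions
# from the three individual classifiers and the iSA point estimates side by side. It assumes the
# predicted_* objects and outSent are still in memory, and that the categories come out in the same
# order for all four methods - check this before trusting the table!
comparison <- rbind(
  NB  = prop.table(table(predicted_nb)),
  RF  = prop.table(table(predicted_rf)),
  SVM = prop.table(table(predicted_svm)),
  iSA = outSent$btab[1:3]  # the iSA point estimates extracted above
)
round(comparison, 3)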