####################################################
# classifiers with a general example
####################################################

rm(list=ls(all=TRUE))
getwd()
setwd("YOUR WORKING DIRECTORY")
getwd()

library(e1071)
library(caTools)
library(randomForest)

data(iris)
str(iris)

iris$spl <- sample.split(iris, SplitRatio=0.5)
train <- subset(iris, iris$spl==TRUE)
test <- subset(iris, iris$spl==FALSE)

NB <- naiveBayes(Species ~ ., data=train)
predictNB <- predict(NB, newdata=test, type="class")
table(predictNB)
table(predictNB, test$Species)
# Accuracy (proportion of correctly classified documents): 0.96
(30+28+28) / nrow(test)

set.seed(123)
RF <- randomForest(Species ~ ., data=train, type="classification")
predictRF <- predict(RF, newdata=test)
table(predictRF)
table(test$Species, predictRF)
# Accuracy (proportion of correctly classified documents): 0.91
(30+28+24) / nrow(test)

set.seed(123)
SVM <- svm(Species ~ ., data=train, method="C-classification")
predictSVM <- predict(SVM, newdata=test)
table(predictSVM)
table(test$Species, predictSVM)
# Accuracy (proportion of correctly classified documents): 0.94
(30+29+26) / nrow(test)
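# The accuracies above were computed by hand from the confusion matrices.
# As a convenience (not part of the original script; the function name and
# argument names are illustrative), the small helper below computes accuracy
# and per-class precision/recall directly from the true and predicted labels,
# assuming both are factors with the same levels.
classification_metrics <- function(true_labels, predicted_labels) {
  cm <- table(true_labels, predicted_labels)   # rows = true classes, columns = predictions
  accuracy  <- sum(diag(cm)) / sum(cm)         # correctly classified / all documents
  precision <- diag(cm) / colSums(cm)          # correct in k / all classified as k
  recall    <- diag(cm) / rowSums(cm)          # correct in k / all truly in k
  list(confusion=cm, accuracy=accuracy, precision=precision, recall=recall)
}
# e.g. classification_metrics(test$Species, predictNB)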
####################################################
# classifiers with texts
####################################################

rm(list=ls(all=TRUE))
getwd()
setwd("YOUR WORKING DIRECTORY")
getwd()

library(quanteda)
library(readtext)
library(caTools)
library(randomForest)
library(e1071)

x10 <- read.csv("Trump_tweets1.csv", stringsAsFactors=FALSE)
str(x10)
myCorpusTwitter <- corpus(x10)

# let's suppose that the sentiment classification via lexdict represents the "true" sentiment
# that a set of human coders would have found
lexdict <- dictionary(file = "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL/LSDaug2015/LSD2015.lc3",
                      format = "lexicoder")
str(lexdict)

trump3 <- dfm(myCorpusTwitter,
              remove = c(stopwords("english"), "rt", "t.co"),
              remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE,
              remove_twitter = TRUE, remove_url = TRUE,
              dictionary = lexdict)
trump3
str(trump3)

df <- data.frame(trump3)
str(df)
head(df)
df$sentiment_diff <- df$X.positive.008800 - df$X.negative.AA0000
str(df)

# we assume 0=neutral; alternatively, we could have forced everything between -1 and +1 to be neutral
df$sentiment2[df$sentiment_diff==0] <- "Neutral"
df$sentiment2[df$sentiment_diff<0] <- "Negative"
df$sentiment2[df$sentiment_diff>0] <- "Positive"
str(df)
table(df$sentiment2)
df$sentiment2 <- as.factor(df$sentiment2)
str(df)

# add the sentiment to the original set of tweets
x10$sentiment <- df$sentiment2
str(x10)
table(x10$sentiment)

myCorpusTwitter <- corpus(x10)
myDfm3 <- dfm(myCorpusTwitter,
              remove = c(stopwords("english"), "rt", "t.co"),
              remove_punct = TRUE, remove_numbers = TRUE, tolower = TRUE, stem = TRUE,
              remove_twitter = TRUE, remove_url = TRUE)
topfeatures(myDfm3, 20)  # 20 top words

# Keep terms that appear in at least 1% of the tweets
trim <- dfm_trim(myDfm3, min_docfreq = 0.01)
data <- as.data.frame(as.matrix(trim))
str(data)
colnames(data)

# this is important: randomForest cannot handle column names that begin with a space, comma,
# number or certain other punctuation. The command below makes the names syntactically valid
# (e.g. it adds a letter in front of a name starting with a number, if any numbers are left
# in the dfm). Highly suggested
colnames(data) <- make.names(colnames(data))
colnames(data)

summary(x10$sentiment)
data$sentiment <- x10$sentiment
str(data$sentiment)
colnames(data)
summary(data$sentiment)

# Build a training and a testing set.
# Given that we are assuming that the sentiment classification via lexdict represents a kind of
# "human coding", we are in effect running a kind of cross-classification procedure
set.seed(123)
split <- sample.split(data$sentiment, SplitRatio=0.6)
table(split)
trainSparse <- subset(data, split==TRUE)
testSparse <- subset(data, split==FALSE)
nrow(trainSparse)
nrow(testSparse)
str(trainSparse)
str(trainSparse$sentiment)
colnames(trainSparse)

# Try a random forest model.
set.seed(123)
tweetRF <- randomForest(sentiment ~ ., data=trainSparse, type="classification")
str(tweetRF)
predictRF <- predict(tweetRF, newdata=testSparse)
table(predictRF)
table(testSparse$sentiment)
table(testSparse$sentiment, predictRF)
# Accuracy (proportion of correctly classified documents): 0.68
(162 + 58 + 53) / nrow(testSparse)
# Precision for "Negative": number of documents correctly classified into category k,
# divided by the total number of documents that the model classifies as category k: 0.75
(162/(162+38+15))
# Recall for "Negative": number of correctly classified category k documents divided by
# the number of human-coded documents in category k: 0.83
(162/(196))

# Try a SVM model
set.seed(123)
# If you ever get this warning message:
#   In svm.default(x, y, scale = scale, ..., na.action = na.action) :
#   Variable(s) 'graham' and 'lindsey' and 'confirm' constant. Cannot scale data.
# it means that in the dfm of the training set some words are always 0 across all texts.
# Either drop such words and re-run the analysis, or add "scale=FALSE" to the svm command
SV <- svm(sentiment ~ ., data=trainSparse, method = "C-classification")
predictSV <- predict(SV, newdata=testSparse)
table(predictSV)
table(testSparse$sentiment)
table(testSparse$sentiment, predictSV)
# Accuracy: 0.69
(181 + 49 + 44) / nrow(testSparse)
# Precision for "Negative": 0.68
(181/(181+55+31))
# Recall for "Negative": 0.92
(181/(196))

## Try a Naive Bayes model: problem with sparsity!
NB <- naiveBayes(sentiment ~ ., data=trainSparse)
predictNB <- predict(NB, newdata=testSparse, type="class")
table(predictNB)
table(testSparse$sentiment, predictNB)
# Accuracy: 0.25
(3 + 5 + 91) / nrow(testSparse)

#### replicate the entire analysis but now keep terms
#### in the dfm that appear in at least 5% of the tweets.
#### Which are the main changes in terms of accuracy?

#### replicate the entire analysis but now keep terms
#### in the dfm that appear in at least 1% of the tweets and
#### reduce the training split (SplitRatio) to 0.3.
#### Which are the main changes in terms of accuracy?
#### (a sketch of the lines that change in both exercises is given below)
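# A minimal sketch of the lines that change for the two exercises above
# (not a full solution: it assumes the objects created earlier -- myDfm3, x10,
# data -- are still in the workspace, and the object names trim5, data5,
# split3, trainSparse3, testSparse3 are illustrative).

# Exercise 1: keep terms that appear in at least 5% of the tweets
trim5 <- dfm_trim(myDfm3, min_docfreq = 0.05)
data5 <- as.data.frame(as.matrix(trim5))
colnames(data5) <- make.names(colnames(data5))
data5$sentiment <- x10$sentiment
# ...then rebuild trainSparse/testSparse from data5 and re-fit the three models as above

# Exercise 2: keep terms in at least 1% of the tweets, but train on only 30% of the data
set.seed(123)
split3 <- sample.split(data$sentiment, SplitRatio=0.3)
trainSparse3 <- subset(data, split3==TRUE)
testSparse3 <- subset(data, split3==FALSE)
# ...then re-fit the three models on trainSparse3 and compare their accuracies on testSparse3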