############ # split-half reliability for a dictionary, for example the syuzhet one ############ library(quanteda) library(syuzhet) library(cvTools) recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991) tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE) tok2 <- tokens_remove(tok2, stopwords("en")) # do not stem the texts! Mydfm <- dfm(tok2) x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english") str(x) names(x)[2] <- "sentiment" str(x) # let's recode the values to negative and positive x$sentiment <- ifelse(x$sentiment< 0, "negative", ifelse(x$sentiment> 0, "positive", "netural")) table(x$sentiment) library(cvTools) # let's split the dictorionary in 2 random folds set.seed(123) # set the see for replicability k <- 2 # the number of folds folds <- cvFolds(NROW(x), K=k) str(folds) dict1 <- x[folds$subsets[folds$which == 1], ] dict2 <- x[folds$subsets[folds$which == 2],] str(dict1 ) str(dict2 ) # now let's convert it to a Quanteda dictionary using the function as.dictionary dict1 <- as.dictionary(dict1 ) is.dictionary(dict1) # now let's convert it to a Quanteda dictionary using the function as.dictionary dict2<- as.dictionary(dict2) is.dictionary(dict2) ext_dfm1 <- dfm_lookup(Mydfm , dictionary = dict1) ext_dfm1 ext_dfm2 <- dfm_lookup(Mydfm , dictionary = dict2) ext_dfm2 Dictionary1 <-convert(ext_dfm1, to="data.frame") Dictionary2 <-convert(ext_dfm2, to="data.frame") str(Dictionary1) str(Dictionary2) Dictionary1$Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative) Dictionary2$Sentiment <- Dictionary2$positive/(Dictionary2$positive+Dictionary2$negative) str(Dictionary1) str(Dictionary2) r <- cor(Dictionary1$Sentiment, Dictionary2$Sentiment) r # Let's compute the Split-half reliability test via the Spearman-Brown prophecy formula: # a value closer to 1 and further from zero indicates greater internal consistency # Thus, in this case, the split-half reliability approach yields an internal consistency estimate of how much? (2 * r) / (1 + r) ################################################### ######## same thing as above but via a LOOP ################################################### library(cvTools) # let's split the dictorionary in 2 random folds set.seed(123) # set the see for replicability k <- 2 # the number of folds folds <- cvFolds(NROW(x), K=k) str(folds) n <- ndoc(recentCorpus) data <- data.frame(NA_col = rep(NA, n)) # Creating data containing NA data for(i in 1:k) { dict1 <- x[folds$subsets[folds$which == i], ] dict1 <- as.dictionary(dict1 ) ext_dfm1 <- dfm_lookup(Mydfm , dictionary = dict1) Dictionary1 <- convert(ext_dfm1, to="data.frame") Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative) data[ , i] <- Sentiment colnames(data)[i] <- paste0("Sentiment_", i) } data r <- cor(data[,1],data[,2]) r # Let's compute the Split-half reliability test via the Spearman-Brown prophecy formula: (2 * r) / (1 + r) ################################################### ######## via LOOP with 50 random draws - usually you should running the same loop with at least 500 random draws ################################################### n <- ndoc(recentCorpus) data <- data.frame(NA_col = rep(NA, n)) # Creating data containing NA data data2 <- data.frame(NA_row = rep(NA, 1)) # Creating data containing NA data2 # let's simulate 50 times split of 2 for(j in 1:50) { set.seed(j) k <- 2 folds <- cvFolds(NROW(x), K=k) for(i in 1:k) { dict1 <- x[folds$subsets[folds$which == i], ] dict1 <- as.dictionary(dict1 ) ext_dfm1 <- dfm_lookup(Mydfm , dictionary = dict1) Dictionary1 <- convert(ext_dfm1, to="data.frame") Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative) data[ , i] <- Sentiment } data2[j,] <- (2 * cor(data[,1],data[,2])) / (1 + cor(data[,1],data[,2])) } str(data2) mean(data2[,1]) sd(data2[,1]) data2$name <- "syuzhet" str(data2) library(ggplot2) ggplot(data2, aes(x=as.factor(name), y=NA_row)) + geom_boxplot(fill="slateblue", alpha=0.2) + xlab("Split-Half reliability (50 random draws)") + ylab("Split-Half reliability") library(hrbrthemes) library(viridis) ggplot(data2, aes(x=as.factor(name), y=NA_row)) + geom_boxplot()+ scale_fill_viridis(discrete = TRUE, alpha=0.6) + geom_jitter(color="black", size=0.4, alpha=0.9) + theme_ipsum() + theme( legend.position="none", plot.title = element_text(size=11) ) + ggtitle("A boxplot with jitter") + xlab("") ################################# ################################# # let's compare the results above with the ones we get via the nrc vocabulary ################################# x2<- get_sentiment_dictionary(dictionary = "nrc", language = "english") str(x2) table(x2$sentiment) x2 <- x2[ which(x2$sentiment=="positive" | x2$sentiment=="negative" ), ] str(x2) x2 <- x2[c(2:3)] str(x2) library(cvTools) # let's split the dictorionary in 2 random folds set.seed(123) # set the see for replicability k <- 2 # the number of folds folds <- cvFolds(NROW(x2), K=k) str(folds) n <- ndoc(recentCorpus) data <- data.frame(NA_col = rep(NA, n)) # Creating data containing NA data for(i in 1:k) { dict1 <- x2[folds$subsets[folds$which == i], ] dict1 <- as.dictionary(dict1 ) ext_dfm1 <- dfm_lookup(Mydfm , dictionary = dict1) Dictionary1 <- convert(ext_dfm1, to="data.frame") Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative) data[ , i] <- Sentiment colnames(data)[i] <- paste0("Sentiment_", i) } data r <- cor(data[,1],data[,2]) # Let's compute the Split-half reliability test via the Spearman-Brown prophecy formula: (2 * r) / (1 + r) ################################################### ######## via LOOP with 50 random draws ################################################### n <- ndoc(recentCorpus) data <- data.frame(NA_col = rep(NA, n)) # Creating data containing NA data data22 <- data.frame(NA_row = rep(NA, 1)) # Creating data containing NA data22 # let's simulate 50 times split of 2 for(j in 1:50) { set.seed(j) k <- 2 folds <- cvFolds(NROW(x), K=k) for(i in 1:k) { dict1 <- x2[folds$subsets[folds$which == i], ] dict1 <- as.dictionary(dict1 ) ext_dfm1 <- dfm_lookup(Mydfm , dictionary = dict1) Dictionary1 <- convert(ext_dfm1, to="data.frame") Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative) data[ , i] <- Sentiment } data22[j,] <- (2 * cor(data[,1],data[,2])) / (1 + cor(data[,1],data[,2])) } data22$name <- "nrc" str(data22) str(data2) mean(data2[,1]) mean(data22[,1]) dataOK <- rbind(data2, data22) str(dataOK) # let's compare the performance of the two sentiment dictionaries: in our case, better the syuzhet dictionary in terms of # internal consistency library(ggplot2) ggplot(dataOK , aes(x=as.factor(name), y=NA_row)) + geom_boxplot(fill="slateblue", alpha=0.2) + xlab("Split-Half reliability (50 random draws)") + ylab("Split-Half reliability")