############
# split-half reliability for a dictionary, for example the syuzhet one
############

library(quanteda)
library(syuzhet)
library(cvTools)

recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
tok2 <- tokens(recentCorpus, remove_punct = TRUE, remove_numbers=TRUE, remove_symbols = TRUE, split_hyphens = TRUE, remove_separators = TRUE)
tok2 <- tokens_remove(tok2, stopwords("en"))
# do not stem the texts!
Mydfm <- dfm(tok2)

x <- get_sentiment_dictionary(dictionary = "syuzhet", language = "english")
str(x)
names(x)[2] <- "sentiment"
str(x)

# let's recode the values to negative and positive
x$sentiment <- ifelse(x$sentiment< 0, "negative", ifelse(x$sentiment> 0, "positive", "netural"))
table(x$sentiment)

library(cvTools)
# let's split the dictorionary in 2 random folds 
set.seed(123) # set the see for replicability
k <- 2 # the number of folds
folds <- cvFolds(NROW(x), K=k)
str(folds)

dict1 <- x[folds$subsets[folds$which == 1], ]
dict2 <- x[folds$subsets[folds$which == 2],]

str(dict1 )
str(dict2 )

# now let's convert it to a Quanteda dictionary using the function as.dictionary
dict1 <- as.dictionary(dict1 )
is.dictionary(dict1)

# now let's convert it to a Quanteda dictionary using the function as.dictionary
dict2<- as.dictionary(dict2)
is.dictionary(dict2)

ext_dfm1 <- dfm_lookup(Mydfm  , dictionary = dict1)
ext_dfm1

ext_dfm2 <- dfm_lookup(Mydfm  , dictionary = dict2)
ext_dfm2

Dictionary1 <-convert(ext_dfm1, to="data.frame")
Dictionary2 <-convert(ext_dfm2, to="data.frame")

str(Dictionary1)
str(Dictionary2)

Dictionary1$Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative)
Dictionary2$Sentiment  <- Dictionary2$positive/(Dictionary2$positive+Dictionary2$negative)

str(Dictionary1)
str(Dictionary2)

r <- cor(Dictionary1$Sentiment, Dictionary2$Sentiment)
r

# Let's compute the Split-half reliability test via the Spearman-Brown prophecy formula: 
# a value closer to 1 and further from zero indicates greater internal consistency
# Thus, in this case, the split-half reliability approach yields an internal consistency estimate of how much?
(2 * r) / (1 + r)

###################################################
######## same thing as above but via a LOOP
###################################################

library(cvTools)
# let's split the dictorionary in 2 random folds 
set.seed(123) # set the see for replicability
k <- 2 # the number of folds
folds <- cvFolds(NROW(x), K=k)
str(folds)

n <- ndoc(recentCorpus)
data <- data.frame(NA_col = rep(NA, n))   # Creating data containing NA
data  

for(i in 1:k) { 
dict1 <- x[folds$subsets[folds$which == i], ]
dict1 <- as.dictionary(dict1 )
ext_dfm1 <- dfm_lookup(Mydfm  , dictionary = dict1)
Dictionary1 <- convert(ext_dfm1, to="data.frame")
Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative)
data[ , i] <- Sentiment 
colnames(data)[i] <- paste0("Sentiment_", i)   
}

data

r <- cor(data[,1],data[,2])
r
# Let's compute the Split-half reliability test via the Spearman-Brown prophecy formula: 
(2 * r) / (1 + r)

###################################################
######## via LOOP with 50 random draws - usually you should running the same loop with at least 500 random draws
###################################################

n <- ndoc(recentCorpus)
data <- data.frame(NA_col = rep(NA, n))     # Creating data containing NA
data  

data2 <- data.frame(NA_row = rep(NA, 1))     # Creating data containing NA
data2 

# let's simulate 50 times split of 2
for(j in 1:50) { 
set.seed(j)
k <- 2 
folds <- cvFolds(NROW(x), K=k)
for(i in 1:k) { 
dict1 <- x[folds$subsets[folds$which == i], ]
dict1 <- as.dictionary(dict1 )
ext_dfm1 <- dfm_lookup(Mydfm  , dictionary = dict1)
Dictionary1 <- convert(ext_dfm1, to="data.frame")
Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative)
data[ , i] <- Sentiment
}
data2[j,] <- (2 * cor(data[,1],data[,2])) / (1 + cor(data[,1],data[,2]))
}

str(data2)
mean(data2[,1])
sd(data2[,1])
data2$name <- "syuzhet"
str(data2)

library(ggplot2)
ggplot(data2, aes(x=as.factor(name), y=NA_row)) + 
    geom_boxplot(fill="slateblue", alpha=0.2) + 
    xlab("Split-Half reliability (50 random draws)") +  ylab("Split-Half reliability")

library(hrbrthemes)
library(viridis)

ggplot(data2, aes(x=as.factor(name), y=NA_row)) + 
    geom_boxplot()+
    scale_fill_viridis(discrete = TRUE, alpha=0.6) +
    geom_jitter(color="black", size=0.4, alpha=0.9) +
    theme_ipsum() +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)
    ) +
    ggtitle("A boxplot with jitter") +
    xlab("")

#################################

#################################
# let's compare the results above with the ones we get via the nrc vocabulary
#################################

x2<- get_sentiment_dictionary(dictionary = "nrc", language = "english")
str(x2)
table(x2$sentiment)

x2 <- x2[ which(x2$sentiment=="positive" | x2$sentiment=="negative" ), ]
str(x2)
x2 <- x2[c(2:3)]
str(x2)

library(cvTools)
# let's split the dictorionary in 2 random folds 
set.seed(123) # set the see for replicability
k <- 2 # the number of folds
folds <- cvFolds(NROW(x2), K=k)
str(folds)

n <- ndoc(recentCorpus)
data <- data.frame(NA_col = rep(NA, n))   # Creating data containing NA
data  

for(i in 1:k) { 
dict1 <- x2[folds$subsets[folds$which == i], ]
dict1 <- as.dictionary(dict1 )
ext_dfm1 <- dfm_lookup(Mydfm  , dictionary = dict1)
Dictionary1 <- convert(ext_dfm1, to="data.frame")
Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative)
data[ , i] <- Sentiment 
colnames(data)[i] <- paste0("Sentiment_", i)   
}

data

r <- cor(data[,1],data[,2])
# Let's compute the Split-half reliability test via the Spearman-Brown prophecy formula: 
(2 * r) / (1 + r)

###################################################
######## via LOOP with 50 random draws
###################################################

n <- ndoc(recentCorpus)
data <- data.frame(NA_col = rep(NA, n))     # Creating data containing NA
data  

data22 <- data.frame(NA_row = rep(NA, 1))     # Creating data containing NA
data22 

# let's simulate 50 times split of 2
for(j in 1:50) { 
set.seed(j)
k <- 2 
folds <- cvFolds(NROW(x), K=k)
for(i in 1:k) { 
dict1 <- x2[folds$subsets[folds$which == i], ]
dict1 <- as.dictionary(dict1 )
ext_dfm1 <- dfm_lookup(Mydfm  , dictionary = dict1)
Dictionary1 <- convert(ext_dfm1, to="data.frame")
Sentiment <- Dictionary1$positive/(Dictionary1$positive+Dictionary1$negative)
data[ , i] <- Sentiment
}
data22[j,] <- (2 * cor(data[,1],data[,2])) / (1 + cor(data[,1],data[,2]))
}

data22$name <- "nrc"
str(data22)
str(data2)
mean(data2[,1])
mean(data22[,1])

dataOK <- rbind(data2, data22)
str(dataOK)

# let's compare the performance of the two sentiment dictionaries: in our case, better the syuzhet dictionary in terms of
# internal consistency

library(ggplot2)
ggplot(dataOK , aes(x=as.factor(name), y=NA_row)) + 
    geom_boxplot(fill="slateblue", alpha=0.2) + 
    xlab("Split-Half reliability (50 random draws)") +  ylab("Split-Half reliability")