rm(list=ls(all=TRUE))

getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()

library(rtweet)
library(ggplot2)
library(dplyr)
library(tidytext)
library(readtext)
library(quanteda)
library(httpuv)
library(maps)
library(leaflet)
library(lattice)
library(cowplot)
library(gridExtra)
library(ggthemes)
library(plyr)
library(syuzhet)
library(igraph)

token <- create_token(
  app = "my_twitter_research_app",
  consumer_key = "XXXXXXXXXXXXXXXXXX",
  consumer_secret = "XXXXXXXXXXXXXXXXXX",
  access_token = "XXXXXXXXXXXXXXXXXX",
  access_secret = "XXXXXXXXXXXXXXXXXX")

get_token()
## check that the token is loaded
identical(token, get_token())

######################################################################################
######################################################################################
############# Recover the latest 3,200 tweets from Donald Trump [as of 6 November 2018, 8:30 am Italian time]
######################################################################################
######################################################################################

tmls <- get_timeline(c("realDonaldTrump"), n = 3200, include_rts = TRUE)
str(tmls)
table(tmls$is_retweet)

#########################
## Passing your results to quanteda
#########################

table(tmls$name)
print(tmls$lang[1:20])
print(tmls$text[1:20])
print(tmls$created_at[1:20])
str(tmls$created_at)

tmls$date <- as.Date(tmls$created_at, "GMT")
str(tmls$date)
tmls$date_number <- as.numeric(tmls$date)
str(tmls$date_number)
tmls$date_factor <- as.factor(tmls$date)
str(tmls$date_factor)
colnames(tmls)

## examine all Twitter activity using weekly intervals
ts_plot(tmls, "weeks")

myCorpusTwitter <- corpus(tmls)
summary(myCorpusTwitter)
head(myCorpusTwitter)
texts(myCorpusTwitter)[1:2]
# number of documents
ndoc(myCorpusTwitter)
# inspect the document-level variables
head(docvars(myCorpusTwitter))

library(stm)

# remove_twitter = TRUE drops the Twitter characters @ and # when building the dfm
myDfm <- dfm(myCorpusTwitter,
             remove = stopwords("english"),
             remove_punct = TRUE,
             remove_numbers = TRUE,
             tolower = TRUE,
             stem = TRUE,
             remove_twitter = TRUE,
             remove_url = TRUE)

topfeatures(myDfm, 20)  # 20 top words
# inspect the document-term matrix for the first 10 documents and first 10 words
myDfm[1:10, 1:10]

# trim rare features from the dfm
myDfm.trim <- dfm_trim(myDfm, min_docfreq = 0.05)

# convert the dfm from quanteda to stm: this is a crucial step!
# note that you also have to list the document variables that are present in the corpus
Dfmstm2 <- convert(myDfm.trim, to = "stm", docvars = docvars(myCorpusTwitter))
summary(warnings())
str(Dfmstm2)
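# Optional sanity check (a minimal sketch, no new packages assumed): compare the size of
# the dfm before and after trimming, and how many documents and features were passed to stm,
# since tweets left with no features can be dropped during the conversion.
dim(myDfm)                  # documents x features before trimming
dim(myDfm.trim)             # documents x features after trimming
length(Dfmstm2$documents)   # number of documents kept for stm
length(Dfmstm2$vocab)       # vocabulary size passed to stm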
###########################
# How many topics (K)?
###########################

K <- c(5:20)
storage <- searchK(Dfmstm2$documents, Dfmstm2$vocab, K,
                   max.em.its = 500,
                   prevalence = ~ date_number,
                   data = Dfmstm2$meta,
                   init.type = "Spectral")
str(storage)

plot(storage$results$semcoh, storage$results$exclus,
     xlab = "Semantic coherence", ylab = "Exclusivity",
     col = "blue", pch = 19, cex = 1, lty = "solid", lwd = 2)
text(storage$results$semcoh, storage$results$exclus,
     labels = storage$results$K, cex = 1, pos = 4)

########## I keep K = 10
stmFitted <- stm(Dfmstm2$documents, Dfmstm2$vocab, K = 10,
                 max.em.its = 75,
                 prevalence = ~ date_number,
                 data = Dfmstm2$meta,
                 init.type = "Spectral")
str(stmFitted)

################################################################
# Interpreting the STM by plotting and inspecting results
################################################################

# Summary visualization: understanding the extracted topics through words and example documents
# First: which word/topic associations?
labelTopics(stmFitted)
plot(stmFitted)
# note that these graphs also show you the frequency distribution of topics across the documents
plot(stmFitted, type = "summary", labeltype = c("frex"))
plot(stmFitted, type = "hist", labeltype = c("frex"))
plot(stmFitted, type = "labels", labeltype = c("frex"))

str(stmFitted)
str(stmFitted$theta)
pca <- as.data.frame(stmFitted$theta)
str(pca)
barplot(apply(pca, 2, mean), main = "Topic Distribution", xlab = "", col = c("red"))

################################################################
# Additional tools for interpretation and visualization
################################################################

# Use the topic correlation graph estimated by topicCorr() and the igraph package to plot a network
# where nodes are topics and edges indicate a positive correlation.
# If there are no edges, there are only negative correlations!
library(igraph)
mod.out.corr <- topicCorr(stmFitted)
str(mod.out.corr)
mod.out.corr

set.seed(12345)
par(mar = c(0, 0, 0, 0))
plot(mod.out.corr,
     vertex.color = "grey",
     vertex.label.color = "black",  # change color of labels
     vertex.label.cex = .75,        # change size of labels to 75% of original size
     edge.curved = .25,             # add a 25% curve to the edges
     edge.color = "grey20",         # change edge color to grey
     edge.width = 3,                # increase the size of the edges
     vertex.shape = "square")
legend("topleft",
       legend = c("Topic 1", "Topic 2", "Topic 3", "Topic 4", "Topic 5",
                  "Topic 6", "Topic 7", "Topic 8", "Topic 9", "Topic 10"),
       cex = 0.8)

#########################################
# Estimating topic prevalence
#########################################

prep <- estimateEffect(1:10 ~ date_number, stmFitted, meta = Dfmstm2$meta, uncertainty = "Global")
plot(prep, "date_number", xaxt = "n", printlegend = FALSE,
     method = "continuous", topics = 1, model = stmFitted)
title("Topic 1")
abline(h = 0, col = "blue")
# label the x-axis with actual dates (axis.Date() expects Date values, not a factor)
axis.Date(side = 1, tmls$date, format = "%d/%m/%Y")
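# Optional: inspect a few representative tweets per topic with stm's findThoughts() and plotQuote().
# A minimal sketch, assuming no tweets were dropped when the trimmed dfm was converted to the stm
# format (otherwise texts(myCorpusTwitter) must first be subset to the documents used in the model).
thoughts1 <- findThoughts(stmFitted, texts = texts(myCorpusTwitter), n = 3, topics = 1)
plotQuote(thoughts1$docs[[1]], width = 60, main = "Topic 1")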
#########################
## Applying dictionaries
#########################

# Let's apply other dictionaries!

# save your results as a csv file
write_as_csv(tmls, "twitter.csv", prepend_ids = TRUE, na = "", fileEncoding = "UTF-8")
# and then re-open it
x <- read.csv("twitter.csv")
str(x)
x$text <- as.character(x$text)
str(x)

# Another general dictionary: the NRC sentiment and emotion lexicon (via syuzhet)
nrc_data <- get_nrc_sentiment(x$text, language = "english")
str(nrc_data)

# relative share of negative and positive sentiment
colSums(prop.table(nrc_data[, 9:10]))

# plot the share of negative and positive sentiment
barplot(
  sort(colSums(prop.table(nrc_data[, 9:10]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Sentiment in Sample text",
  xlab = "Percentage"
)

# plot the share of the eight emotions
barplot(
  sort(colSums(prop.table(nrc_data[, 1:8]))),
  horiz = TRUE,
  cex.names = 0.7,
  las = 1,
  main = "Emotions in Sample text",
  xlab = "Percentage"
)

# collapse the tweets associated with each emotion into one document per emotion
all = c(
  paste(x$text[nrc_data$anger > 0], collapse = " "),
  paste(x$text[nrc_data$anticipation > 0], collapse = " "),
  paste(x$text[nrc_data$disgust > 0], collapse = " "),
  paste(x$text[nrc_data$fear > 0], collapse = " "),
  paste(x$text[nrc_data$joy > 0], collapse = " "),
  paste(x$text[nrc_data$sadness > 0], collapse = " "),
  paste(x$text[nrc_data$surprise > 0], collapse = " "),
  paste(x$text[nrc_data$trust > 0], collapse = " ")
)

# clean the text
library(tm)
library(wordcloud)

# function to make the text suitable for analysis
clean.text = function(x) {
  # to lower case
  x = tolower(x)
  # remove rt (note: this also strips "rt" inside words)
  x = gsub("rt", "", x)
  # remove @mentions
  x = gsub("@\\w+", "", x)
  # remove punctuation
  x = gsub("[[:punct:]]", "", x)
  # remove numbers
  x = gsub("[[:digit:]]", "", x)
  # remove http links
  x = gsub("http\\w+", "", x)
  # remove tabs and runs of blanks
  x = gsub("[ |\t]{2,}", "", x)
  # remove blank spaces at the beginning
  x = gsub("^ ", "", x)
  # remove blank spaces at the end
  x = gsub(" $", "", x)
  return(x)
}

all = clean.text(all)
# remove stop-words
all = removeWords(all, c(stopwords("english"), 'Trump', "trump"))
# create corpus
corpus = Corpus(VectorSource(all))
# create term-document matrix
tdm = TermDocumentMatrix(corpus)
# convert to matrix
tdm = as.matrix(tdm)
# add column names
colnames(tdm) = c('anger', 'anticipation', 'disgust', 'fear',
                  'joy', 'sadness', 'surprise', 'trust')

# Plot comparison wordcloud
layout(matrix(c(1, 2), nrow = 2), heights = c(1, 4))
par(mar = rep(0, 4))
plot.new()
text(x = 0.5, y = 0.5, 'Emotion Comparison Word Cloud')
comparison.cloud(tdm, random.order = FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC",
                            "green", "orange", "blue", "brown"),
                 title.size = 1.5, max.words = 250)

##########################
##########################
# create the left-right economic scale
##########################
##########################

dictfile <- tempfile()
download.file("https://provalisresearch.com/Download/LaverGarry.zip", dictfile, mode = "wb")
unzip(dictfile, exdir = (td <- tempdir()))
lgdict <- dictionary(file = paste(td, "LaverGarry.cat", sep = "/"))
str(lgdict)
lgdict

# define the comparison corpus of recent inaugural speeches before applying the dictionary
recentCorpus <- corpus_subset(data_corpus_inaugural, Year > 1991)
lg_dfm <- dfm(recentCorpus, dictionary = lgdict)
lg_dfm

# extract the "More State" and "Less State" word lists from the Laver-Garry dictionary
x <- unlist(lgdict, recursive = FALSE)
str(x)
more_state <- as.data.frame(x[5])
str(more_state)
colnames(more_state)[1] <- "words"
more_state$words2 <- as.character(more_state$words)
str(more_state)
less_state <- as.data.frame(x[7])
str(less_state)
colnames(less_state)[1] <- "words"
less_state$words2 <- as.character(less_state$words)
str(less_state)

######################################
# Eco scale
######################################

eco <- dictionary(list(less_state = less_state$words2, more_state = more_state$words2))
eco
byPresMat <- dfm(recentCorpus, dictionary = eco)
byPresMat

# apply the full Laver-Garry dictionary to the Trump tweets
Trump_corpus <- corpus(tmls)
Trump_dfm <- dfm(Trump_corpus, dictionary = lgdict)
Trump_dfm
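# Raw dictionary counts depend on tweet length. As an optional robustness check (a minimal
# sketch, assuming a quanteda version in which dfm_weight() accepts scheme = "prop"), the
# counts can be expressed as within-tweet proportions before comparing categories.
Trump_dfm_prop <- dfm_weight(Trump_dfm, scheme = "prop")
head(convert(Trump_dfm_prop, to = "data.frame"))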
# Let's focus on the categories "More State" and "Less State"
Dictionary <- convert(Trump_dfm, to = "data.frame")
colnames(Dictionary)
names(Dictionary)[6] <- "More_State"
names(Dictionary)[8] <- "Less_State"
colnames(Dictionary)

Dictionary$pro_market <- Dictionary$Less_State - Dictionary$More_State
table(Dictionary$pro_market)
Dictionary$date <- tmls$date_factor

# get daily summaries of the results (pro-market score and number of tweets)
daily <- ddply(Dictionary, ~ date, summarize,
               num_tweets = length(pro_market),
               ave_promarket = mean(pro_market))
str(daily)

# correlation between the volume of discussion and the pro-market score
cor(daily$ave_promarket, daily$num_tweets)

# plot the daily pro-market score vs. volume
pro_market <- ggplot(daily, aes(x = date, y = ave_promarket, group = 1)) +
  geom_line(linetype = "dashed", colour = "red") +
  ggtitle("Pro-Market Trump") +
  xlab("Day") +
  ylab("Pro-Market") +
  theme_gray(base_size = 12)

volume <- ggplot(daily, aes(x = date, y = num_tweets, group = 1)) +
  geom_line() +
  ggtitle("Trump tweets per day") +
  xlab("Day") +
  ylab("Volume") +
  theme_gray(base_size = 12)

grid.arrange(pro_market, volume, ncol = 1)
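# Optionally save the combined figure to disk: a minimal sketch using arrangeGrob() from
# gridExtra (already loaded) together with ggsave(); the file name and size are illustrative.
g <- arrangeGrob(pro_market, volume, ncol = 1)
ggsave("trump_promarket_volume.png", g, width = 8, height = 6)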