# Twitter network analysis: setup, data download and hashtag extraction.
#
# The workspace is intentionally NOT cleared with rm(list = ls()): wiping
# the caller's global environment is a destructive side effect and does not
# give a clean R session anyway (attached packages and options survive).

# Folder used for the output files of this script. The working directory is
# only changed when that folder exists, so the script no longer stops with
# an error on machines that do not have this path.
project_dir <- "C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL"
getwd()
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  message(
    "Directory '", project_dir, "' not found; keeping working directory ",
    getwd()
  )
}
getwd()

library(rtweet)
library(readtext)
library(quanteda)

# Twitter API credentials. Each value is read from an environment variable
# so real keys do not have to be saved in the script; when a variable is
# not set, the original placeholder string is used as the fallback.
token <- create_token(
  app = "my_twitter_research_app",
  consumer_key = Sys.getenv("TWITTER_CONSUMER_KEY", unset = "XXXXXXXXXX"),
  consumer_secret = Sys.getenv(
    "TWITTER_CONSUMER_SECRET",
    unset = "XXXXXXXXXXX"
  ),
  access_token = Sys.getenv("TWITTER_ACCESS_TOKEN", unset = "XXXXXXXXXXX"),
  access_secret = Sys.getenv("TWITTER_ACCESS_SECRET", unset = "XXXXXXXXXXXXx")
)

get_token()
## check to see if the token is loaded
identical(token, get_token())

# Download up to 1000 English tweets (retweets included) matching "rstats"
rt <- search_tweets("rstats", n = 1000, include_rts = TRUE, lang = "en")
# head() instead of [1:20]: no NA padding when fewer than 20 tweets come back
print(head(rt$lang, 20))

myCorpusTwitter <- corpus(rt)
head(texts(myCorpusTwitter), 20)
# number of documents
ndoc(myCorpusTwitter)

# remove_twitter = TRUE would strip the Twitter characters @ and # when
# building the dfm; it is NOT used here because the analysis below focuses
# on hashtags.
myDfm <- dfm(myCorpusTwitter, remove_punct = TRUE)

##################################################
# Extract most common hashtags and plot their network
##################################################

# Keep only the features matching the pattern "#*"
tag_dfm <- dfm_select(myDfm, "#*")
tag_dfm[1:4, 1:10]

# Now I extract the top 50 hashtags
toptag <- names(topfeatures(tag_dfm, 50))
head(toptag)

# Construct feature-occurrence matrix of hashtags (measuring co-occurrences
# of features within a user-defined context).
# How to count co-occurrences:
#   "boolean": counts only whether the co-occurrence happens within the
#   context, irrespective of how many times it occurs.
#   "frequency" (the default): counts the number of co-occurrences within
#   the context.
# To understand the difference, look at this toy example:
txts <- c("a a a b b c", "a a c e", "a c e f g")
txts
fcm(txts, context = "document", count = "boolean")
fcm(txts, context = "document", count = "frequency")

# Co-occurrence matrix of all hashtags
tag_fcm <- fcm(tag_dfm)
head(tag_fcm)

# Keep only the hashtags listed in toptag
topgat_fcm <- fcm_select(tag_fcm, toptag)
head(topgat_fcm)
str(topgat_fcm)

# Write the network plot to a PDF. The plot is wrapped in print(): values of
# top-level expressions are not auto-printed when a script is run through
# source(), and without the explicit print() the PDF would then be empty.
pdf("testplot.pdf", width = 10, height = 8)
print(
  textplot_network(topgat_fcm, min_freq = 0.7, edge_alpha = 0.2, edge_size = 5)
)
dev.off()

# Same graph on the current graphics device (e.g. the RStudio plot pane)
print(
  textplot_network(topgat_fcm, min_freq = 0.7, edge_alpha = 0.2, edge_size = 5)
)

########################################################
## let's save our result as an igraph object
########################################################

igraph <- as.igraph(topgat_fcm, min_freq = 0.7)
igraph

# Packages needed from here on (kept at this position so that this section
# still runs when it is executed on its own)
library(hrbrthemes)
library(ggraph)
library(tidyverse)
library(igraph)

vcount(igraph)   # number of vertices (nodes)
V(igraph)        # nodes
V(igraph)$name   # names of each node
ecount(igraph)   # number of edges
E(igraph)        # edges

par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.color = "grey",         # node colour
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

igraph
strength(igraph)
strength(igraph, mode = "out")
strength(igraph, mode = "in")
V(igraph)$frequency <- strength(igraph)
strength(igraph)

# Only show the labels of hashtags whose strength is at least 2000
V(igraph)$label <- ifelse(strength(igraph) >= 2000, V(igraph)$name, NA)

par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.color = "grey",         # node colour
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

# descriptive statistics
sort(degree(igraph))
sort(betweenness(igraph))

# Density. The value can look inflated when the graph has loops or multiple
# edges, so it is also computed on the simplified graph below.
edge_density(igraph)
# manual computation with the directed-network formula E / (V * (V - 1))
ecount(igraph) / (vcount(igraph) * (vcount(igraph) - 1))
edge_density(simplify(igraph))
edge_density(simplify(igraph, remove.loops = TRUE, remove.multiple = TRUE))

# Community detection: run the walktrap algorithm once and reuse the result
comm <- cluster_walktrap(igraph)
comm
modularity(comm)   # modularity score
length(comm)       # number of communities
membership(comm)   # community membership for each node

par(mar = c(0, 0, 0, 0))
plot(comm, igraph)

plot(
  comm, igraph,
  # NOTE(review): plot() on a communities object may colour the nodes by
  # membership itself -- confirm whether this vertex.color has any effect
  vertex.color = "grey",
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

# Colour each node according to its community
V(igraph)$color <- membership(comm)
par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

# create a sub-network composed by ONLY the nodes in subv and the edges
# between them (the graph is rebuilt first, which drops the label and color
# attributes added above)
igraph <- as.igraph(topgat_fcm, min_freq = 0.7)
V(igraph)$name   # names of each node

subv <- c("#bigdata", "#datascientist")

# Two panels side by side for the plots that follow
par(mfrow = c(1, 2), mar = c(0, 0, 0, 0))
# Hashtags from subv that really are nodes of the graph. Asking for a vertex
# name that is not in the graph makes the subgraph functions fail, so missing
# hashtags are reported and skipped instead.
subv_found <- intersect(subv, V(igraph)$name)
if (length(subv_found) < length(subv)) {
  warning(
    "Hashtags not present in the network: ",
    paste(setdiff(subv, subv_found), collapse = ", "),
    call. = FALSE
  )
}

# First panel: the full hashtag network. The seed is set before each of the
# two plots so that both start from the same random state.
set.seed(111)
plot(
  igraph,
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

# Second panel: only the selected hashtags and the edges between them
set.seed(111)
plot(
  induced_subgraph(graph = igraph, vids = subv_found),
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

# The side-by-side comparison is complete: go back to one plot per page so
# the two-panel layout does not leak into every later plot of the script.
par(mfrow = c(1, 1))

# create a sub-network composed by the nodes in subv and all their first
# degree neighbors
first_degree <- unique(unlist(ego(graph = igraph, order = 1, nodes = subv_found)))
plot(
  induced_subgraph(graph = igraph, vids = first_degree),
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

# create a sub-network composed by the nodes in subv and, if some of them is
# connected to other nodes (even if not in subv), take also them
# (and of course include all the edges among this bunch of nodes).
# Split the hashtag graph into its weakly connected components
sg1 <- decompose(igraph, mode = "weak")

# Names of all nodes in every component that contains at least one hashtag
# of subv. lapply() always returns a list, whereas the return type of
# sapply() depends on the shape of the results.
neighverts <- unique(unlist(lapply(sg1, function(component) {
  member_names <- V(component)$name
  if (any(member_names %in% subv)) member_names else NULL
})))
neighverts
membership(comm)

# Build the sub-network once and reuse it for both plots
g3 <- induced_subgraph(graph = igraph, vids = neighverts)
plot(g3)

plot(
  g3,
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "grey20",         # edge colour (dark grey)
  edge.arrow.size = 0.1,         # arrow-head size
  vertex.shape = "square"        # draw nodes as squares
)

##################################################
# Extract most frequently mentioned usernames in a tweet and plot their network
##################################################

# The dfm is rebuilt here so this section can be run on its own
myDfm <- dfm(myCorpusTwitter, remove_punct = TRUE)

# Keep only the features matching the pattern "@*" and take the top 50
user_dfm <- dfm_select(myDfm, "@*")
topuser <- names(topfeatures(user_dfm, 50))
head(topuser)

# Construct feature-occurrence matrix of usernames
user_fcm <- fcm(user_dfm)
head(user_fcm)
user_fcm <- fcm_select(user_fcm, topuser)

# The plots are wrapped in print(): values of top-level expressions are not
# auto-printed when a script is run through source(), and without the
# explicit print() the PDF files would then be empty.
pdf("user.pdf", width = 10, height = 8)
print(
  textplot_network(
    user_fcm,
    min_freq = 0.1,
    edge_color = "orange",
    edge_alpha = 0.8,
    edge_size = 5
  )
)
dev.off()

# Same plot, this time with omit_isolated = FALSE
pdf("user2.pdf", width = 10, height = 8)
print(
  textplot_network(
    user_fcm,
    min_freq = 0.1,
    edge_color = "orange",
    edge_alpha = 0.8,
    edge_size = 5,
    omit_isolated = FALSE
  )
)
dev.off()

# and from here, once again, you can pass the file to igraph and then do the
# usual stuff...
##################################################
# Using other packages
##################################################

##################################################
# Building the retweet network
##################################################

rt2 <- rt

# if a tweet is a retweet or not
table(rt2$is_retweet)
# number of times a tweet has been retweeted
table(rt2$retweet_count)
# Show only the first rows (head(print(x)) would print the whole table first)
print(head(rt2[c("text", "is_retweet")]))

# user name of the person tweeting (or retweeting) something in our sample
rt2$screen_name
table(rt2$screen_name)
# retweet_screen_name captures the name of the retweeted person
rt2$retweet_screen_name

# I want to focus only on the retweets. is_retweet is used directly as the
# filter condition rather than being compared with the string "TRUE".
retweets <- dplyr::filter(rt2, is_retweet)
table(retweets$is_retweet)

# Edge list: sender = the retweeted user, receiver = the user who retweeted.
# data.frame() is used instead of as.data.frame(cbind(...)), which goes
# through a character matrix first.
el2 <- data.frame(
  sender = tolower(retweets$retweet_screen_name),
  receiver = tolower(retweets$screen_name),
  stringsAsFactors = FALSE
)
str(el2)
head(el2, 5)   # show the first 5 edges in the edgelist

# One row per sender/receiver pair; column n holds the number of retweets
el2 <- dplyr::count(el2, sender, receiver)
str(el2)
table(el2$n)
head(el2, 5)   # show the first 5 edges in the edgelist

igraph2 <- graph_from_data_frame(d = el2, directed = TRUE)
igraph2

vcount(igraph2)   # number of vertices (nodes)
V(igraph2)        # nodes
V(igraph2)$name   # names of each node
ecount(igraph2)   # number of edges
E(igraph2)        # edges

plot(igraph2)

par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.color = "white",        # node colour
  vertex.size = 1,               # reduce the size of the nodes
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "red",            # edge colour
  edge.arrow.size = 0.4,         # arrow-head size
  vertex.shape = "circle"        # draw nodes as circles
)

igraph2
# NOTE(review): the retweet counts are stored in the edge attribute n, not
# weight, so strength() presumably ignores them here and equals the degree --
# confirm whether weights = E(igraph2)$n was intended.
strength(igraph2)
table(strength(igraph2))
V(igraph2)$frequency <- strength(igraph2)
strength(igraph2)

# Only show the names of users whose strength is at least 15
V(igraph2)$label <- ifelse(strength(igraph2) >= 15, V(igraph2)$name, NA)
# Plot the retweet network with the selective labels assigned to igraph2.
# (This used to plot igraph, the hashtag network, so the retweet labels that
# had just been set were never shown.)
par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.color = "white",        # node colour
  vertex.size = 1,               # reduce the size of the nodes
  vertex.label.color = "black",  # label colour
  vertex.label.cex = 1,          # labels at their original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "red",            # edge colour
  edge.arrow.size = 0.4,         # arrow-head size
  vertex.shape = "circle"        # draw nodes as circles
)

# descriptive statistics
sort(degree(igraph2))
sort(betweenness(igraph2))
edge_density(igraph2)

# Community detection: run the walktrap algorithm once and reuse the result
comm <- cluster_walktrap(igraph2)
comm
modularity(comm)   # modularity score
length(comm)       # number of communities
membership(comm)   # community membership for each node

par(mar = c(0, 0, 0, 0))
plot(comm, igraph2)

plot(
  comm, igraph2,
  # NOTE(review): plot() on a communities object may colour the nodes by
  # membership itself -- confirm whether this vertex.color has any effect
  vertex.color = "white",
  vertex.size = 1,               # reduce the size of the nodes
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "red",            # edge colour
  edge.arrow.size = 0.4,         # arrow-head size
  vertex.shape = "circle"        # draw nodes as circles
)

# Colour each node according to its community
V(igraph2)$color <- membership(comm)
par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the original size
  edge.curved = .25,             # add a 25% curve to the edges
  edge.color = "red",            # edge colour
  edge.arrow.size = 0.4,         # arrow-head size
  vertex.shape = "circle"        # draw nodes as circles
)

### another possible graph representation using ggraph

# To help de-clutter the vertex labels, we'll only add labels for nodes that
# have a degree of 15 or more (rough guess -- you should look at the degree
# distribution for more formal work).
# We'll also include the degree for those nodes so we can size them properly.
# The graph is rebuilt from the edge list, which drops the label and color
# attributes that were added for the base-graphics plots.
igraph2 <- graph_from_data_frame(d = el2, directed = TRUE)

# Compute the degree once and reuse it for both node attributes
node_degree <- degree(igraph2)
is_labelled <- node_degree > 15

# Nodes with degree > 15 get their name as label and their degree as size;
# all other nodes get an empty label and size 0.
V(igraph2)$node_label <- unname(ifelse(is_labelled, V(igraph2)$name, ""))
V(igraph2)$node_size <- unname(ifelse(is_labelled, node_degree, 0))

retweet_plot <- ggraph(igraph2, layout = "linear", circular = TRUE) +
  geom_edge_arc(edge_width = 0.125, aes(alpha = ..index..)) +
  geom_node_label(
    aes(label = node_label, size = node_size),
    label.size = 0,
    fill = "#ffffff66",
    segment.colour = "springgreen",
    color = "slateblue",
    repel = TRUE,
    family = font_rc,
    fontface = "bold"
  ) +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(
    title = "Retweet Relationships",
    subtitle = "Most retweeted screen names labeled. Darker edges == more retweets. Node size == larger degree"
  ) +
  theme_graph(base_family = font_rc) +
  theme(legend.position = "none")

# The plot is printed explicitly: values of top-level expressions are not
# auto-printed when a script is run through source(), and without print()
# the PDF would then be empty.
pdf("retweet_network.pdf", width = 10, height = 8)
print(retweet_plot)
dev.off()