# Start from an empty workspace (hidden objects included) and move into the
# project folder. The working directory is echoed before and after the change.
# NOTE(review): the absolute path below is machine-specific; the script will
# stop here on any computer where that folder does not exist.
rm(list = ls(all.names = TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()
library(rtweet)
library(readtext)
library(quanteda)
# Create the OAuth token that rtweet uses for the Twitter API requests below.
# NOTE(review): the key/secret strings look like placeholders; real
# credentials have to be supplied before the script can run.
token <- create_token(
  app             = "my_twitter_research_app",
  consumer_key    = "XXXXXXXXXX",
  consumer_secret = "XXXXXXXXXXX",
  access_token    = "XXXXXXXXXXX",
  access_secret   = "XXXXXXXXXXXXx"
)
# Print the token rtweet returns when asked for the current one
get_token()
# Sanity check: TRUE when that token is the one created above
identical(token, get_token())
# Search for tweets matching "rstats": up to 1000 results, retweets included,
# restricted to lang = "en".
rt <- search_tweets(
  "rstats",
  n = 1000,
  include_rts = TRUE,
  lang = "en"
)
# Language field of the first 20 rows
print(rt$lang[seq_len(20)])
# Build a quanteda corpus from the search result and show the first 20 texts
myCorpusTwitter <- corpus(rt)
texts(myCorpusTwitter)[seq_len(20)]
# Number of documents in the corpus
ndoc(myCorpusTwitter)
# Document-feature matrix of the tweets with punctuation removed.
# remove_twitter = TRUE is deliberately NOT requested: it would strip the
# Twitter characters @ and #, and the hashtags are exactly what is analysed
# below.
myDfm <- dfm(myCorpusTwitter, remove_punct = TRUE)

##################################################
# Most common hashtags and their network
##################################################
# Keep only the features that match the glob "#*" (the hashtags)
tag_dfm <- dfm_select(myDfm, "#*")
tag_dfm[seq_len(4), seq_len(10)]
# Names of the 50 most frequent hashtags
toptag <- names(topfeatures(tag_dfm, n = 50))
head(toptag)
# A feature co-occurrence matrix (FCM) measures how often features occur
# together within a user-defined context. Two ways of counting:
#   "boolean"   - records only whether two features co-occur in the context,
#                 irrespective of how many times.
#   "frequency" - (the default) counts the number of co-occurrences in the
#                 context.
# Toy example that makes the difference visible:
txts <- c("a a a b b c", "a a c e", "a c e f g")
print(txts)
fcm(txts, context = "document", count = "boolean")
fcm(txts, context = "document", count = "frequency")
# Co-occurrence matrix of all hashtags
tag_fcm <- fcm(tag_dfm)
head(tag_fcm)
# Restrict the matrix to the hashtags listed in toptag
topgat_fcm <- fcm_select(tag_fcm, toptag)
head(topgat_fcm)
str(topgat_fcm)

# Build the network plot once so that the PDF and the on-screen version show
# the same figure.
tag_network_plot <- textplot_network(
  topgat_fcm,
  min_freq = 0.7,
  edge_alpha = 0.2,
  edge_size = 5
)

# The plot object is only drawn when it is printed. The explicit print() keeps
# the PDF from coming out blank when the script is run through source(), where
# top-level values are not auto-printed.
pdf("testplot.pdf", width = 10, height = 8)
print(tag_network_plot)
dev.off()

# Same figure on the active graphics device (e.g. the RStudio plot pane)
print(tag_network_plot)
########################################################
## Convert the hashtag FCM into an igraph object
########################################################
# Same min_freq threshold as the network plot of the hashtag FCM
igraph <- as.igraph(topgat_fcm, min_freq = 0.7)
print(igraph)

# Packages needed from here on (attached in this order)
library(hrbrthemes)
library(ggraph)
library(tidyverse)
library(igraph)

# Basic inventory of the graph
vcount(igraph)   # how many vertices (nodes)
V(igraph)        # the vertex sequence
V(igraph)$name   # vertex names
ecount(igraph)   # how many edges
E(igraph)        # the edge sequence
# First look at the hashtag network with the base igraph plot method; the
# margins are set to zero so the graph uses the whole device.
par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.color       = "grey",    # node fill colour
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = .75,       # labels at 75% of the original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "grey20",  # dark grey edges
  edge.arrow.size    = 0.1,       # arrow-head size
  vertex.shape       = "square"   # square nodes
)
print(igraph)

# Node strength: default mode, then outgoing only, then incoming only
strength(igraph)
strength(igraph, mode = "out")
strength(igraph, mode = "in")
# Store the strength on every vertex as its `frequency` attribute
V(igraph)$frequency <- strength(igraph)
strength(igraph)

# Show a label only for hashtags whose strength is at least 2000; every other
# vertex gets NA as its label.
V(igraph)$label <- ifelse(strength(igraph) >= 2000, V(igraph)$name, NA)
par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.color       = "grey",    # node fill colour
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = .75,       # labels at 75% of the original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "grey20",  # dark grey edges
  edge.arrow.size    = 0.1,       # arrow-head size
  vertex.shape       = "square"   # square nodes
)
# ---- Descriptive statistics for the hashtag network ----
# Degree and betweenness of every node, in ascending order
sort(degree(igraph))
sort(betweenness(igraph))

# Edge density as reported by igraph for the graph as it stands. Loops or
# multiple edges affect this figure, hence the simplified versions below.
edge_density(igraph)
# Hand-computed ratio edges / (n * (n - 1)), the formula for a directed network
ecount(igraph) / (vcount(igraph) * (vcount(igraph) - 1))
# Density after simplify(): first with its defaults, then with the removal of
# loops and multiple edges requested explicitly
edge_density(simplify(igraph))
edge_density(simplify(igraph, remove.loops = TRUE, remove.multiple = TRUE))
# ---- Community detection on the hashtag network (walktrap) ----
# Print the result once, then keep it in `comm` for the summaries and plots.
cluster_walktrap(igraph)
comm <- cluster_walktrap(igraph)
modularity(comm)   # modularity score of the partition
length(comm)       # number of communities
membership(comm)   # community membership of each node

# Plot the communities: default look first, then with custom styling
par(mar = c(0, 0, 0, 0))
plot(comm, igraph)
plot(
  comm, igraph,
  vertex.color       = "grey",    # node fill colour
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = .75,       # labels at 75% of the original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "grey20",  # dark grey edges
  edge.arrow.size    = 0.1,       # arrow-head size
  vertex.shape       = "square"   # square nodes
)

# Store the community membership as the vertex colour and plot the graph again
V(igraph)$color <- membership(comm)
par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = .75,       # labels at 75% of the original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "grey20",  # dark grey edges
  edge.arrow.size    = 0.1,       # arrow-head size
  vertex.shape       = "square"   # square nodes
)
# ---- Sub-networks around a chosen set of hashtags ----
# Rebuild the hashtag graph from the FCM so it starts without the label and
# colour attributes that were added to the previous `igraph` object.
igraph <- as.igraph(topgat_fcm, min_freq = 0.7)
V(igraph)$name # names of each node

# Hashtags of interest. The tweets come from a live search, so these may be
# missing from the current sample; fail early with a readable message instead
# of an igraph "invalid vertex name" error further down.
subv <- c("#bigdata", "#datascientist")
absent_tags <- setdiff(subv, V(igraph)$name)
if (length(absent_tags) > 0) {
  stop(
    "Hashtag(s) not present in the network: ",
    paste(absent_tags, collapse = ", "),
    call. = FALSE
  )
}

# Shared styling for every hashtag graph drawn in this section.
plot_tag_graph <- function(g) {
  plot(
    g,
    vertex.label.color = "black",   # label colour
    vertex.label.cex   = .75,       # labels at 75% of the original size
    edge.curved        = .25,       # 25% curvature on the edges
    edge.color         = "grey20",  # dark grey edges
    edge.arrow.size    = 0.1,       # arrow-head size
    vertex.shape       = "square"   # square nodes
  )
}

# (1) Sub-network made ONLY of the nodes in subv and the edges between them,
# drawn next to the full network. The seed is reset before each plot.
par(mfrow = c(1, 2), mar = c(0, 0, 0, 0))
set.seed(111)
plot_tag_graph(igraph)
set.seed(111)
plot_tag_graph(induced_subgraph(graph = igraph, vids = subv))
# Back to one plot per page; otherwise every later figure in the script would
# be squeezed into half of a two-panel layout.
par(mfrow = c(1, 1))

# (2) Sub-network made of the nodes in subv plus all their first-degree
# neighbours.
plot_tag_graph(
  induced_subgraph(
    graph = igraph,
    vids = unlist(neighborhood(graph = igraph, order = 1, nodes = subv))
  )
)

# (3) Sub-network made of every node that sits in the same weakly connected
# component as a node in subv (including all edges among those nodes).
# igraph::decompose is named explicitly because stats also has a decompose().
sg1 <- igraph::decompose(igraph, mode = "weak")
neighverts <- unique(unlist(lapply(sg1, function(component) {
  component_names <- V(component)$name
  if (any(component_names %in% subv)) component_names else NULL
})))
neighverts
membership(comm)
g3 <- induced_subgraph(graph = igraph, vids = neighverts)
plot(g3)
plot_tag_graph(g3)
##################################################
# Most frequently mentioned usernames and their network
##################################################
myDfm <- dfm(myCorpusTwitter, remove_punct = TRUE)
# Keep only the features that match the glob "@*" (the mentioned usernames)
user_dfm <- dfm_select(myDfm, ("@*"))
# Names of the 50 most frequent usernames
topuser <- names(topfeatures(user_dfm, 50))
head(topuser)

# Feature co-occurrence matrix of usernames, then restricted to the top users
user_fcm <- fcm(user_dfm)
head(user_fcm)
user_fcm <- fcm_select(user_fcm, topuser)

# The plot objects are only drawn when printed. The explicit print() keeps the
# PDFs from coming out blank when the script is run through source().
pdf("user.pdf", width = 10, height = 8)
print(
  textplot_network(
    user_fcm,
    min_freq = 0.1,
    edge_color = "orange",
    edge_alpha = 0.8,
    edge_size = 5
  )
)
dev.off()

# Same network, this time with omit_isolated = FALSE
pdf("user2.pdf", width = 10, height = 8)
print(
  textplot_network(
    user_fcm,
    min_freq = 0.1,
    edge_color = "orange",
    edge_alpha = 0.8,
    edge_size = 5,
    omit_isolated = FALSE
  )
)
dev.off()
# From here the FCM can again be converted to igraph and analysed as above.
##################################################
# Using other packages
##################################################
##################################################
# Building the retweet network
##################################################
rt2 <- rt
# Is a tweet a retweet or not
table(rt2$is_retweet)
# Number of times a tweet has been retweeted
table(rt2$retweet_count)
# First rows of the text / retweet-flag columns. The previous head(print(...))
# printed the whole table first and only then its head.
head(rt2[c("text", "is_retweet")])
# Screen name of the account that tweeted (or retweeted) each row
rt2$screen_name
table(rt2$screen_name)
# retweet_screen_name holds the name of the account that was retweeted
rt2$retweet_screen_name

# Keep only the retweets. as.logical() accepts a logical flag as well as the
# strings "TRUE"/"FALSE", so the filter no longer relies on comparing a
# logical column with the text "TRUE".
retweets <- dplyr::filter(rt2, as.logical(is_retweet))
table(retweets$is_retweet)

# Edge list: sender = the retweeted account, receiver = the retweeting account
# (both lower-cased). data.frame() keeps the columns as character vectors on
# every R version.
el2 <- data.frame(
  sender = tolower(retweets$retweet_screen_name),
  receiver = tolower(retweets$screen_name),
  stringsAsFactors = FALSE
)
str(el2)
el2[1:5, ] # show the first 5 edges in the edgelist

# Collapse duplicated sender/receiver pairs; column n counts the repetitions
el2 <- dplyr::count(el2, sender, receiver)
str(el2)
table(el2$n)
el2[1:5, ] # show the first 5 edges in the edgelist

# Directed graph built from the edge list
igraph2 <- graph_from_data_frame(d = el2, directed = TRUE)
igraph2
vcount(igraph2) # number of vertices (nodes)
V(igraph2) # nodes
V(igraph2)$name # names of each node
ecount(igraph2) # number of edges
E(igraph2) # edges
plot(igraph2)
# Styled plot of the full retweet network
par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.color       = "white",   # node fill colour
  vertex.size        = 1,         # very small nodes
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = .75,       # labels at 75% of the original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "red",     # red edges
  edge.arrow.size    = 0.4,       # arrow-head size
  vertex.shape       = "circle"   # circular nodes
)
igraph2

# Node strength of every account, its distribution, and a copy stored on the
# vertices as the `frequency` attribute.
# NOTE(review): the edge counts live in the edge-list column `n`, not in an
# attribute called `weight`, so strength() presumably equals the plain degree
# here. Confirm whether weighting by `n` was intended.
strength(igraph2)
table(strength(igraph2))
V(igraph2)$frequency <- strength(igraph2)
strength(igraph2)

# Label only the accounts whose strength is at least 15; the rest get NA
V(igraph2)$label <- ifelse(strength(igraph2) >= 15, V(igraph2)$name, NA)
par(mar = c(0, 0, 0, 0))
# Fix: plot the retweet network (igraph2). The earlier code plotted `igraph`,
# the hashtag network, so the labels computed just above were never shown.
plot(
  igraph2,
  vertex.color       = "white",   # node fill colour
  vertex.size        = 1,         # very small nodes
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = 1,         # labels at their original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "red",     # red edges
  edge.arrow.size    = 0.4,       # arrow-head size
  vertex.shape       = "circle"   # circular nodes
)
# ---- Descriptive statistics for the retweet network ----
sort(degree(igraph2))        # degree of every account, ascending
sort(betweenness(igraph2))   # betweenness of every account, ascending
edge_density(igraph2)

# ---- Community detection on the retweet network (walktrap) ----
# Print the result once, then keep it in `comm` (this replaces the communities
# stored earlier for the hashtag network).
cluster_walktrap(igraph2)
comm <- cluster_walktrap(igraph2)
modularity(comm)   # modularity score of the partition
length(comm)       # number of communities
membership(comm)   # community membership of each node

# Plot the communities: default look first, then with custom styling
par(mar = c(0, 0, 0, 0))
plot(comm, igraph2)
plot(
  comm, igraph2,
  vertex.color       = "white",   # node fill colour
  vertex.size        = 1,         # very small nodes
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = .75,       # labels at 75% of the original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "red",     # red edges
  edge.arrow.size    = 0.4,       # arrow-head size
  vertex.shape       = "circle"   # circular nodes
)

# Store the community membership as the vertex colour and plot the graph again
V(igraph2)$color <- membership(comm)
par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.label.color = "black",   # label colour
  vertex.label.cex   = .75,       # labels at 75% of the original size
  edge.curved        = .25,       # 25% curvature on the edges
  edge.color         = "red",     # red edges
  edge.arrow.size    = 0.4,       # arrow-head size
  vertex.shape       = "circle"   # circular nodes
)
### Another possible graph representation, using ggraph
# To de-clutter the vertex labels, only nodes with a degree greater than 15
# get a label (rough guess - look at the degree distribution for more formal
# work). The degree of those nodes is also kept so they can be sized by it;
# every other node gets an empty label and size 0.
igraph2 <- graph_from_data_frame(d = el2, directed = TRUE)
node_degree <- degree(igraph2)   # computed once, reused for label and size
is_hub <- node_degree > 15
V(igraph2)$node_label <- unname(ifelse(is_hub, V(igraph2)$name, ""))
V(igraph2)$node_size <- unname(ifelse(is_hub, node_degree, 0))

# Circular linear layout with arc edges and repelled labels for the hubs.
# NOTE(review): `..index..` is the older ggplot2 notation for computed
# variables; recent ggplot2 versions prefer after_stat(index).
retweet_plot <- ggraph(igraph2, layout = 'linear', circular = TRUE) +
  geom_edge_arc(edge_width = 0.125, aes(alpha = ..index..)) +
  geom_node_label(
    aes(label = node_label, size = node_size),
    label.size = 0, fill = "#ffffff66", segment.colour = "springgreen",
    color = "slateblue", repel = TRUE, family = font_rc, fontface = "bold"
  ) +
  coord_fixed() +
  scale_size_area(trans = "sqrt") +
  labs(title = "Retweet Relationships", subtitle = "Most retweeted screen names labeled. Darkers edges == more retweets. Node size == larger degree") +
  theme_graph(base_family = font_rc) +
  theme(legend.position = "none")

# The plot is only drawn when it is printed. The explicit print() keeps the
# PDF from coming out blank when the script is run through source().
pdf("retweet_network.pdf", width = 10, height = 8)
print(retweet_plot)
dev.off()