# Start from an empty workspace; all.names = TRUE also removes objects whose
# names begin with a dot.
rm(list = ls(all.names = TRUE))

# Show the current working directory, move to the project folder, and show
# the working directory again to confirm the change.
getwd()
setwd("C:/Users/mw/Dropbox (VOICES)/TOPIC MODEL")
getwd()
library(rtweet)
library(readtext)
library(quanteda)


# Build the Twitter API token from the app credentials and keep it in `token`.
# NOTE(review): the values below are placeholders; real keys should not be
# committed together with the script.
token <- create_token(
  app             = "my_twitter_research_app",
  consumer_key    = "XXXXXXXXXX",
  consumer_secret = "XXXXXXXXXXX",
  access_token    = "XXXXXXXXXXX",
  access_secret   = "XXXXXXXXXXXXx"
)

# Print the token that rtweet currently resolves.
get_token()

# Check that the token created above is the one rtweet resolves.
identical(token, get_token())

# Search for up to 1000 English-language tweets matching "rstats",
# retweets included.
rt <- search_tweets("rstats", n = 1000, include_rts = TRUE, lang = "en")

# Language code of the first tweets. head() is used instead of [1:20] so that
# a search returning fewer than 20 tweets is not padded with NA values.
print(head(rt$lang, 20))

# Turn the tweets into a quanteda corpus and preview the first texts
# (again without NA padding when fewer than 20 documents exist).
myCorpusTwitter <- corpus(rt)
head(texts(myCorpusTwitter), 20)

# number of documents
ndoc(myCorpusTwitter)

# remove_twitter = TRUE would strip the Twitter characters @ and # while the
# dfm is built. The hashtags and user names are needed further down, so that
# option is deliberately not used here.
myDfm <- dfm(myCorpusTwitter, remove_punct = TRUE)

##################################################
# Extract most common hashtags and plot their network
##################################################

# Keep only the features matching the pattern "#*", i.e. the hashtags,
# and show a small corner of the resulting matrix.
tag_dfm <- dfm_select(myDfm, "#*")
tag_dfm[1:4, 1:10]

# Names of the 50 most frequent hashtags
toptag <- names(topfeatures(tag_dfm, n = 50))
head(toptag)

# Construct feature-occurrence matrix of hashtags (measuring co-occurrences of features within a user-defined context)
# how to count co-occurrences:
# "boolean": counts only the co-occurrence or not within the context, irrespective of how many times it occurs.
# "frequency" (the default): counts the number of co-occurrences within the context

# to understand it, look at this example:
# Toy example: three short documents, counted first with "boolean" and then
# with "frequency" co-occurrence counting at the document level.
txts <- c("a a a b b c", "a a c e", "a c e f g")
txts
fcm(txts, context = "document", count = "boolean")
fcm(txts, context = "document", count = "frequency")

# Feature co-occurrence matrix of all hashtags
tag_fcm <- fcm(tag_dfm)
head(tag_fcm)

# Restrict the co-occurrence matrix to the hashtags listed in `toptag`
topgat_fcm <- fcm_select(tag_fcm, toptag)
head(topgat_fcm)
str(topgat_fcm)

# Save the hashtag co-occurrence network to a PDF file.
# The value returned by textplot_network() is wrapped in print() so that the
# plot is also drawn when the script is run through source(), where top-level
# results are not printed automatically and the PDF would otherwise be empty.
pdf("testplot.pdf", width = 10, height = 8)
print(textplot_network(topgat_fcm, min_freq = 0.7, edge_alpha = 0.2, edge_size = 5))
dev.off()

# The same network on the active graphics device (for example the RStudio
# plot pane, or a graphics window in the plain R GUI).
print(textplot_network(topgat_fcm, min_freq = 0.7, edge_alpha = 0.2, edge_size = 5))

########################################################
## let's save our result as a igraph object 
########################################################

# Convert the top-hashtag co-occurrence matrix into an igraph object, using
# the same min_freq threshold as the network plot, and print its summary.
# NOTE(review): the object is called `igraph`, like the package attached below.
igraph <- as.igraph(topgat_fcm, min_freq = 0.7)
igraph

library(hrbrthemes)
library(ggraph)
library(tidyverse)
library(igraph)

# Size and content of the hashtag graph
vcount(igraph)  # number of vertices (nodes)
V(igraph)       # the nodes
V(igraph)$name  # name of each node
ecount(igraph)  # number of edges
E(igraph)       # the edges

# Plot the graph without figure margins.
par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.shape = "square",       # draw nodes as squares
  vertex.color = "grey",         # node fill colour
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "grey20",         # dark grey edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.1          # arrow-head size (not the edge width)
)

# Print the graph summary and the node strengths (overall, "out" and "in").
igraph
strength(igraph)
strength(igraph, mode = "out")
strength(igraph, mode = "in")

# Store the strength as a vertex attribute and print it once more.
V(igraph)$frequency <- strength(igraph)
strength(igraph)

# Only label the hashtags whose strength is at least 2000; every other node
# gets an NA label.
V(igraph)$label <- ifelse(strength(igraph) >= 2000, V(igraph)$name, NA)

par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.shape = "square",       # draw nodes as squares
  vertex.color = "grey",         # node fill colour
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "grey20",         # dark grey edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.1          # arrow-head size (not the edge width)
)

# descriptive statistics
# Descriptive statistics: degree and betweenness, sorted in increasing order
sort(degree(igraph))
sort(betweenness(igraph))

# Graph density.
# NOTE(review): the original note asked about a density "larger than 0";
# presumably "larger than 1" was meant, which can happen when the graph has
# loops or multiple edges - confirm.
edge_density(igraph)

# Density by hand, using the formula for a directed network
ecount(igraph) / (vcount(igraph) * (vcount(igraph) - 1))

# Density after simplify(); the second call writes out explicitly that both
# loops and multiple edges are removed.
edge_density(simplify(igraph))
edge_density(simplify(igraph, remove.loops = TRUE, remove.multiple = TRUE))

# Community detection with the walktrap algorithm: store the result in `comm`
# and print it.
comm <- cluster_walktrap(igraph)
comm
modularity(comm)  # modularity score
length(comm)      # number of communities
membership(comm)  # community membership for each node

# Plot the communities on top of the graph: first with the default look,
# then with the styling used for the other graph plots.
par(mar = c(0, 0, 0, 0))
plot(comm, igraph)
plot(
  comm, igraph,
  vertex.shape = "square",       # draw nodes as squares
  vertex.color = "grey",         # node fill colour
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "grey20",         # dark grey edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.1          # arrow-head size (not the edge width)
)

# Colour every node by its community membership and plot the graph itself.
V(igraph)$color <- membership(comm)
par(mar = c(0, 0, 0, 0))
plot(
  igraph,
  vertex.shape = "square",       # draw nodes as squares
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "grey20",         # dark grey edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.1          # arrow-head size (not the edge width)
)

# create a sub-network composed by ONLY the nodes in subv and the edges between them
# Rebuild the hashtag graph from the co-occurrence matrix; the new object does
# not carry the label / color / frequency attributes added to the old one.
igraph <- as.igraph(topgat_fcm, min_freq = 0.7)
V(igraph)$name # names of each node

# Hashtags of interest. The tweets come from a live search, so these tags are
# not guaranteed to be vertices of the graph: tags that are absent are dropped
# with a warning, and the script stops with a clear message if none is left
# (selecting a vertex name that does not exist is an error in igraph).
subv <- c("#bigdata", "#datascientist")
missing_tags <- setdiff(subv, V(igraph)$name)
if (length(missing_tags) > 0) {
  warning("Hashtags not found among the graph vertices and skipped: ",
          paste(missing_tags, collapse = ", "), call. = FALSE)
  subv <- intersect(subv, V(igraph)$name)
}
if (length(subv) == 0) {
  stop("None of the hashtags in `subv` is a vertex of the graph.",
       call. = FALSE)
}

# Side by side: the full graph (left) and the sub-network made of ONLY the
# nodes in subv and the edges between them (right). The same seed is set
# before each plot.
par(mfrow = c(1, 2), mar = c(0, 0, 0, 0))
set.seed(111)
plot(
  igraph,
  vertex.shape = "square",       # draw nodes as squares
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "grey20",         # dark grey edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.1          # arrow-head size (not the edge width)
)
set.seed(111)
plot(
  induced_subgraph(graph = igraph, vids = subv),
  vertex.shape = "square",
  vertex.label.color = "black",
  vertex.label.cex = .75,
  edge.color = "grey20",
  edge.curved = .25,
  edge.arrow.size = 0.1
)
# Back to one plot per page; otherwise every later plot in the script would
# still be drawn in one half of the two-panel layout.
par(mfrow = c(1, 1))

# create a sub-network composed by the nodes in subv and all their first degree neighbors
plot(
  induced_subgraph(
    graph = igraph,
    vids = unlist(neighborhood(graph = igraph, order = 1, nodes = subv))
  ),
  vertex.shape = "square",
  vertex.label.color = "black",
  vertex.label.cex = .75,
  edge.color = "grey20",
  edge.curved = .25,
  edge.arrow.size = 0.1
)

# create a sub-network composed by the nodes in subv and, if some of them is
# connected to other nodes (even if not in subv), take also them 
# (and of course include all the edges among this bunch of nodes). 

# Split the graph into its weakly connected components. The call is
# namespace-qualified because stats also provides a function named decompose().
sg1 <- igraph::decompose(igraph, mode = "weak")

# Collect the vertex names of every component that contains at least one of
# the hashtags in subv. lapply() always returns a list, so the result does not
# depend on how sapply() would happen to simplify it.
neighverts <- unique(unlist(lapply(sg1, function(s) {
  if (any(V(s)$name %in% subv)) V(s)$name else NULL
})))
neighverts
membership(comm)

# Sub-network induced by those vertices: plotted first with the default look,
# then with the styling used for the other graph plots.
g3 <- induced_subgraph(graph = igraph, vids = neighverts)
plot(g3)
plot(
  g3,
  vertex.shape = "square",       # draw nodes as squares
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "grey20",         # dark grey edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.1          # arrow-head size (not the edge width)
)

##################################################
# Extract most frequently mentioned usernames in a tweet and plot their network
##################################################

# Document-feature matrix of the tweets, restricted to the features matching
# "@*", i.e. the mentioned user names.
myDfm <- dfm(myCorpusTwitter, remove_punct = TRUE)
user_dfm <- dfm_select(myDfm, "@*")

# Names of the 50 most frequently mentioned users
topuser <- names(topfeatures(user_dfm, 50))
head(topuser)

# Construct feature-occurrence matrix of usernames
user_fcm <- fcm(user_dfm)
head(user_fcm)

# Keep only the top users, then save the mention network twice: once with the
# default handling of isolated nodes and once with omit_isolated = FALSE.
# Each plot is wrapped in print() so that it is also drawn when the script is
# run through source(), where top-level results are not printed automatically
# and the PDF files would otherwise be empty.
user_fcm <- fcm_select(user_fcm, topuser)
pdf("user.pdf", width = 10, height = 8)
print(textplot_network(user_fcm, min_freq = 0.1, edge_color = 'orange',
                       edge_alpha = 0.8, edge_size = 5))
dev.off()

pdf("user2.pdf", width = 10, height = 8)
print(textplot_network(user_fcm, min_freq = 0.1, edge_color = 'orange',
                       edge_alpha = 0.8, edge_size = 5, omit_isolated = FALSE))
dev.off()

# and from here, once again, you can pass the file to igraph and then do the usual stuff...

##################################################
# Using other packages
##################################################

##################################################
#Building the retweet network
##################################################

# Work on a copy of the tweet data
rt2 <- rt

# if a tweet is a retweet or not
table(rt2$is_retweet)

# number of times a tweet has been retweeted
table(rt2$retweet_count)

# First rows of the text / retweet-flag columns. The original head(print(...))
# printed the whole object and then its head a second time.
head(rt2[c("text", "is_retweet")])

# user name of the person tweeting (or retweeting) something in our sample
rt2$screen_name
table(rt2$screen_name)

# retweet_screen_name captures the name of the retweeted person
rt2$retweet_screen_name

# I want to focus only on the retweets
# Keep only the retweets. The flag is coerced with as.logical() instead of
# being compared with the string "TRUE", and dplyr::filter() is called by its
# full name so that another filter() on the search path cannot be picked up.
retweets <- dplyr::filter(rt2, as.logical(is_retweet))
table(retweets$is_retweet)

# Edge list: sender = the retweeted account, receiver = the account that
# retweeted it; both lower-cased. data.frame() builds the character columns
# directly instead of going through a cbind() matrix.
el2 <- data.frame(
  sender = tolower(retweets$retweet_screen_name),
  receiver = tolower(retweets$screen_name),
  stringsAsFactors = FALSE
)
str(el2)
head(el2, 5) # first edges; head() avoids NA rows when fewer than 5 exist

# Collapse duplicated sender/receiver pairs; column n holds how many times
# each pair occurs.
el2 <- dplyr::count(el2, sender, receiver)
str(el2)
table(el2$n)
head(el2, 5) # first edges of the collapsed edge list

# Directed retweet graph built from the edge list
igraph2 <- graph_from_data_frame(d = el2, directed = TRUE)
igraph2
vcount(igraph2)  # number of vertices (nodes)
V(igraph2)       # the nodes
V(igraph2)$name  # name of each node
ecount(igraph2)  # number of edges
E(igraph2)       # the edges

# Default plot first, then a restyled version without figure margins.
plot(igraph2)

par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.shape = "circle",       # draw nodes as circles
  vertex.color = "white",        # node fill colour
  vertex.size = 1,               # small nodes
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "red",            # red edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.4          # arrow-head size (not the edge width)
)

# Print the retweet graph summary, the node strengths and their distribution.
igraph2
strength(igraph2)
table(strength(igraph2))

# Store the strength as a vertex attribute and print it once more.
V(igraph2)$frequency <- strength(igraph2)
strength(igraph2)

# Only label the users whose strength is at least 15; every other node gets
# an NA label.
V(igraph2)$label <- ifelse(strength(igraph2) >= 15, V(igraph2)$name, NA)

# Plot the retweet graph. The original call plotted `igraph` (the hashtag
# graph), so the labels assigned to igraph2 just above were never shown.
par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.shape = "circle",       # draw nodes as circles
  vertex.color = "white",        # node fill colour
  vertex.size = 1,               # small nodes
  vertex.label.color = "black",  # label colour
  vertex.label.cex = 1,          # labels at the default size
  edge.color = "red",            # red edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.4          # arrow-head size (not the edge width)
)

# descriptive statistics
# Descriptive statistics of the retweet graph: degree and betweenness sorted
# in increasing order, then the graph density
sort(degree(igraph2))
sort(betweenness(igraph2))
edge_density(igraph2)

# Community detection with the walktrap algorithm: store the result in `comm`
# (replacing the hashtag communities) and print it.
comm <- cluster_walktrap(igraph2)
comm
modularity(comm)  # modularity score
length(comm)      # number of communities
membership(comm)  # community membership for each node

# Plot the communities on top of the graph: first with the default look,
# then with the styling used for the other retweet plots.
par(mar = c(0, 0, 0, 0))
plot(comm, igraph2)
plot(
  comm, igraph2,
  vertex.shape = "circle",       # draw nodes as circles
  vertex.color = "white",        # node fill colour
  vertex.size = 1,               # small nodes
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "red",            # red edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.4          # arrow-head size (not the edge width)
)

# Colour every node by its community membership and plot the graph itself.
V(igraph2)$color <- membership(comm)
par(mar = c(0, 0, 0, 0))
plot(
  igraph2,
  vertex.shape = "circle",       # draw nodes as circles
  vertex.label.color = "black",  # label colour
  vertex.label.cex = .75,        # labels at 75% of the default size
  edge.color = "red",            # red edges
  edge.curved = .25,             # curvature applied to the edges
  edge.arrow.size = 0.4          # arrow-head size (not the edge width)
)

### another possible graph representation using ggraph

# To help de-clutter the vertex labels, we’ll only add labels for nodes that have a degree greater than 15
# (rough guess — you should look at the degree distribution for more formal work). 
# We’ll also include the degree for those nodes so we can size them properly:

# Rebuild the retweet graph from the edge list; the new object does not carry
# the label / color / frequency attributes added to the old one.
igraph2 <- graph_from_data_frame(d = el2, directed = TRUE)

# Nodes with a degree above 15 get their name as label and their degree as
# size; all other nodes get an empty label and size 0. degree() already
# returns one value per vertex in vertex order, so it is computed once and
# used directly.
node_degree <- degree(igraph2)
is_hub <- node_degree > 15
V(igraph2)$node_label <- unname(ifelse(is_hub, V(igraph2)$name, ""))
V(igraph2)$node_size <- unname(ifelse(is_hub, node_degree, 0))

# Circular arc diagram of the retweet relationships, saved as a PDF.
# The plot is wrapped in print() so that it is also drawn when the script is
# run through source(), where top-level results are not printed automatically
# and the PDF would otherwise be empty.
pdf("retweet_network.pdf", width = 10, height = 8)
print(
  ggraph(igraph2, layout = 'linear', circular = TRUE) +
    geom_edge_arc(edge_width=0.125, aes(alpha=..index..)) +
    geom_node_label(aes(label=node_label, size=node_size),
                    label.size=0, fill="#ffffff66", segment.colour="springgreen",
                    color="slateblue", repel=TRUE, family=font_rc, fontface="bold") +
    coord_fixed() +
    scale_size_area(trans="sqrt") +
    labs(title="Retweet Relationships", subtitle="Most retweeted screen names labeled. Darkers edges == more retweets. Node size == larger degree") +
    theme_graph(base_family=font_rc) +
    theme(legend.position="none")
)
dev.off()