rm(list=ls(all=TRUE))
getwd()
setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL")
getwd()
library(tidyverse)
library(dplyr)
library(igraph)
library(hrbrthemes)
library(ggraph)
library(tidyverse)
letters <- read.csv("network_ex.csv")
letters
str(letters)
# change the name of the columns and converting everything to lower case (not needed of course...)
el2 <- as.data.frame(cbind(sender = tolower(letters$source), receiver = tolower(letters$destination)))
str(el2)
# number of times that sender and receiver appears together (that is, the number of times a letter goest from A to B)
el2 <- count(el2, sender, receiver)
str(el2)
colnames(el2)[3] <- "weight"
table(el2$weight)
el2[1:5,] #show the first 5 edges in the edgelist
print(el2[c("sender", "receiver", "weight")])
# let's pass the matrix to igraph to create our network
# Note that if the network is unidirected, then it doesn't matter which vertex/node is in the 'V1' (i.e., first variable name) versus 'V2' columns.
# However, if the network is directed, then the convention is that the edge goes FROM V1 TO V2.
# here note that directed=TRUE; otherwise write: directed=FALSE
# Remember that if your vertex/node IDs are numbers, it is better to designate them as characters (or factors)
# or else igraph can get confused
igraph <- graph_from_data_frame(el2, directed = TRUE)
igraph
# What does it mean? - D means directed (otherwise you would have read U for unidirected)
# N means named graph
# W means weighted graph
# 7 is the number of nodes
# 18 is the number of edges
# name (v/c) means name is a node attribute and it’s a character
# weight (e/n) means weight is an edge attribute and it’s numeric (you can also add other edge attributes if you want as separated columns!)
# This is how you access specific elements within the igraph object:
vcount(igraph) # number of vertices (nodes)
V(igraph) # nodes
V(igraph)$name # names of each node
vertex_attr(igraph) # all attributes of the nodes
ecount(igraph) # number of edges
E(igraph) # edges
E(igraph)$weight # weights for each edge
# getting the weight for each of the edges in the list of edges that originate from node number 5
E(igraph) [ from(5) ]$weight
# getting the weight for each of the edges in the list of edges that end onto node number 3
E(igraph) [ to(3) ]$weight
# getting the weight of the edge that simple connects just node 5 and node 3
E(igraph)[5 %--% 3]$weight
edge_attr(igraph) # all attributes of the edges
# you can also create the corresponding adjacency matrix
igraph[] # adjacency matrix; note: 3 letters from antwerp to lisse, but just 1 from lisse to antwerp
igraph[1,] # first row of adjacency matrix
####################################################################
# Network visualization
####################################################################
# mar is A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin
# to be specified on the four sides of the plot
par(mar=c(0,0,0,0))
plot(igraph) # (look at the arrows! if you have a directional network, the arrows are on both sides!)
par(mar=c(0,0,0,0))
plot(igraph, vertex.color = "grey", # change color of nodes
vertex.label.color = "black", # change color of labels
vertex.label.cex = .75, # change size of labels to 75% of original size
edge.curved=.25, # add a 25% curve to the edges
edge.color="grey20", # change edge color to grey
edge.arrow.size = 1, # increase the size of the edges
vertex.shape="square" ) # change the shape of the nodes as a square
# Now imagine that we want to modify some of these plotting attributes so that they are function of network properties
# For example, we can change the width of the edges according to their weight
par(mar=c(0,0,0,0))
plot(igraph, vertex.color = "grey", # change color of nodes
vertex.label.color = "black", # change color of labels
vertex.label.cex = .75, # change size of labels to 75% of original size
edge.curved=.25, # add a 25% curve to the edges
edge.color="grey20", # change edge color to grey
edge.arrow.size = 1, # increase the size of the edges
vertex.shape="square", # change the shape of the nodes as a square
edge.width=E(igraph)$weight) # add the weight to the edges
par(mar=c(0,0,0,0))
plot(igraph, vertex.color = "grey", # change color of nodes
vertex.label.color = "black", # change color of labels
vertex.label.cex = .75, # change size of labels to 75% of original size
edge.curved=.25, # add a 25% curve to the edges
edge.color="grey20", # change edge color to grey
edge.arrow.size = 1, # increase the size of the edges
vertex.shape="square", # change the shape of the nodes as a square
edge.width=E(igraph)$weight, # add the weight to the edges
edge.label = el2$weight, # show the label of the weights along the edges [take a look now at the relationship between antwerp to lisse]
edge.label.font=2, # font of the label (2=bold)
edge.label.cex=2,
edge.label.color="red") # colour of the label
# We can also change the colors of each node based on some of their characteristics
# create vectors with characters in each side
holland <- c("antwerp", "lisse", "haarlem")
germany <- c("bremen", "hamburg")
other <- c("venice", "delft ")
# node we'll create a new color variable as a node property
V(igraph)$color <- NA
V(igraph)$color[V(igraph)$name %in% holland] <- "red"
V(igraph)$color[V(igraph)$name %in% germany] <- "gold"
V(igraph)$color[V(igraph)$name %in% other] <- "grey"
vertex_attr(igraph)
par(mar=c(0,0,0,0))
plot(igraph)
# If we want to indicate what the colors correspond to, we can add a legend.
par(mar=c(0,0,0,0))
plot(igraph)
legend(x=.75, y=.75, legend=c("Holland", "Germany", "Other"),
pch=21, pt.bg=c("red", "gold", "grey"), pt.cex=2, bty="n")
# Another common adjustment is to change the size of the nodes and node labels so that they match their weight
igraph[]
strength(igraph)
strength(igraph, mode="out")
strength(igraph, mode="in")
V(igraph)$size <- strength(igraph)
par(mar=c(0,0,0,0))
plot(igraph)
# Now we’re only going to show the labels of character that gets more tha 9 letters
V(igraph)$label <- ifelse( strength(igraph)>=10, V(igraph)$name, NA )
par(mar=c(0,0,0,0))
plot(igraph)
# Up to now, each time we run the plot function, the nodes appear to be in a different location.
# Why? Because it’s running a probabilistic function trying to locate them in the optimal way possible.
# However, we can also specify the layout for the plot; that is, the (x,y) coordinates where each node will be placed.
# igraph has a few different layouts built-in, that will use different algorithms to find an optimal distribution of nodes.
# The following code illustrates some of these:
par(mfrow=c(2, 3), mar=c(0,0,1,0))
plot(igraph, layout=layout_randomly, main="Random")
plot(igraph, layout=layout_in_circle, main="Circle")
plot(igraph, layout=layout_as_star, main="Star")
plot(igraph, layout=layout_as_tree, main="Tree")
plot(igraph, layout=layout_on_grid, main="Grid")
plot(igraph, layout=layout_with_fr, main="Force-directed")
# Note that each of these is actually just a matrix of (x,y) locations for each node.
l <- layout_randomly(igraph)
str(l)
# The most popular layouts are force-directed. These algorithms, such as Fruchterman-Reingold,
# try to position the nodes so that the edges have similar length and there are as few crossing edges as possible.
# The idea is to generate “clean” layouts, where nodes that are closer to each other share more connections in common
à that those that are located further apart. Note that this is a non-deterministic algorithm:
# choosing a different seed will generate different layouts.
par(mfrow=c(1,2))
set.seed(777)
fr <- layout_with_fr(igraph, niter=1000)
par(mar=c(0,0,0,0))
plot(igraph, layout=fr)
set.seed(666)
fr2 <- layout_with_fr(igraph, niter=1000)
par(mar=c(0,0,0,0))
plot(igraph, layout=fr2)
# Another popular layout is the graphopt algorithm created by Michael Schmuhl
par(mfrow=c(1,2))
set.seed(777)
gh <- layout_with_graphopt(igraph, niter=1000)
par(mar=c(0,0,0,0))
plot(igraph, layout=gh)
set.seed(666)
gh2 <- layout_with_graphopt(igraph, niter=1000)
par(mar=c(0,0,0,0))
plot(igraph, layout=gh2)
# another alternative is layout.fruchterman.reingold
####################################################################
####################################################################
Other ways to represent a network
####################################################################
####################################################################
# heatmap of the network matrix
netm <- get.adjacency(igraph, attr="weight", sparse=F)
str(netm)
palf <- colorRampPalette(c("gold", "dark orange"))
heatmap(netm[,1:7], Rowv = NA, Colv = NA, col = palf(100), scale="none", margins=c(10,10) )
####################################################################
####################################################################
Descriptive analysis about a network
####################################################################
####################################################################
el2 <- as.data.frame(cbind(sender = tolower(letters$source), receiver = tolower(letters$destination)))
head(el2)
str(el2)
# number of times that sender and receiver appears together (that is, number of times a letter goest from A to B)
el2 <- count(el2, sender, receiver)
str(el2)
colnames(el2)[3] <- "weight"
table(el2$weight)
el2[1:5,] # show the first 5 edges in the edgelist
print(el2[c("sender", "receiver", "weight")])
igraph <- graph_from_data_frame(el2, directed = TRUE)
igraph
# What are the most important nodes in a network? What is the propensity of two nodes that are connected
# to be both connected to a third node? What are the different hidden communities in a network?
# Let's try to answer to these questions!
####################################################################
# Measuring node importance
####################################################################
# Node properties
# We’ll start with descriptive statistics at the node level. All of these are in some way measures of importance or centrality.
# The most basic measure is DEGREE, the number of adjacent edges to each node.
# It is often considered a measure of direct influence.
# In our letter network, it will be the unique number of cities that each city is interacting with.
degree(igraph)
sort(degree(igraph))
# In directed graphs, as in ours, there are three types of degree:
# indegree (incoming edges), outdegree (outgoing edges), and total degree.
# You can find these using mode="in" or mode="out" or mode="total".
degree(igraph, mode="in")
degree(igraph, mode="out")
degree(igraph, mode="total")
deg <- degree(igraph)
plot(igraph, vertex.size=deg*3)
hist(deg, main="Histogram of node degree", breaks = 7)
# STRENGHT (that we have already seen!) is a weighted measure of degree that takes into account the number of edges that
# go from one node to another. In this network, it will be the total number of interactions of each city with anyone else.
igraph[]
sort(strength(igraph))
strength(igraph, mode="in")
strength(igraph, mode="out")
# CLOSENESS measures how many steps are required to access every other node from a given node.
# It’s a measure of how long information takes to arrive (who hears news first?). Higher values mean less centrality.
plot(igraph)
sort(closeness(igraph, normalized=TRUE))
which.max(closeness(igraph, normalized=TRUE))
max(closeness(igraph, normalized=TRUE))
# BETWEENNESS measures brokerage or gatekeeping potential.
# It is (approximately) the number of shortest paths between nodes that pass through a particular node.
sort(betweenness(igraph))
# EIGENVECTOR CENTRALITY is a measure of being well-connected connected to the well-connected.
# First eigenvector of the graph adjacency matrix. [Only works with undirected networks? se non metti directed=TRUE, default is FALSE]
centrality <- eigen_centrality(igraph, directed = TRUE)
sort(centrality$vector)
# PAGE RANK approximates probability that any message will arrive to a particular node.
# This algorithm was developed by Google founders, and originally applied to website links.
sort(page_rank(igraph)$vector)
# The HUBS and AUTHORITY algorithm developed by Jon Kleinberg was initially used to examine web pages.
# Hubs were expected to contain catalogs with a large number of outgoing links;
# while authorities would get many incoming links from hubs, presumably because of their high-quality relevant information.
# AUTHORITY score is therefore measure of centrality initially applied to the Web.
# A node has high authority when it is linked by many other nodes that are linking many other nodes.
sort(authority_score(igraph)$vector)
# authority_score(igraph)$vector
# authority_score(igraph, weights=NA)$vector
hs <- hub_score(igraph, weights=NA)$vector
as <- authority_score(igraph, weights=NA)$vector
par(mfrow=c(1,2))
plot(igraph, vertex.size=hs*50, main="Hubs")
plot(igraph, vertex.size=as*30, main="Authorities")
# Summing up
sort(degree(igraph))
sort(strength(igraph))
sort(closeness(igraph, normalized=TRUE))
sort(betweenness(igraph))
sort(centrality$vector)
sort(page_rank(igraph)$vector)
sort(authority_score(igraph)$vector)
# Finally, not exactly a measure of centrality, but we can learn more about who each node is connected to by using the following functions:
# neighbors (for direct neighbors) and ego (for neighbors up to n neighbors away)
V(igraph)$name
plot(igraph)
igraph[]
neighbors(igraph, v=which(V(igraph)$name=="bremen"))
ego(igraph, order=1, nodes=which(V(igraph)$name=="bremen"))
ego(igraph, order=2, nodes=which(V(igraph)$name=="bremen"))
neighbors(igraph, v=which(V(igraph)$name=="antwerp"))
ego(igraph, order=1, nodes=which(V(igraph)$name=="antwerp"))
ego(igraph, order=2, nodes=which(V(igraph)$name=="antwerp"))
####################################################################
# Measuring Network properties
####################################################################
# Let’s now try to describe what a network looks like as a whole. We can start with measures of the size of a network.
# DIAMETER is the length of the longest path (in number of edges) between two nodes.
# To find the diameter of a graph, first find the shortest path between each pair of vertices.
# The greatest length of any of these paths is the diameter of the graph.
# [We can use get_diameter to identify this path] MEAN_DISTANCE is the average number of edges between any two nodes in the network.
# We can find each of these paths between pairs of edges with distances.
plot(igraph)
diameter(igraph, directed=TRUE, weights=NA)
# Bremen - Delft is 3 and not 1 cause there is no direct link that goes from Delft to Bremen, but only viceversa
get_diameter(igraph, directed=TRUE, weights=NA)
# look at the differences if directed=TRUE or directed=FALSE!
mean_distance(igraph, directed=TRUE)
mean_distance(igraph, directed=FALSE)
dist <- distances(igraph, weights=NA, mode = c("out"))
dist[1:7, 1:7]
# We can extract the distances to a node or set of nodes we are interested in. Here we will get the distance of every city from Venice.
igraph
dist.from.venice <- distances(igraph, v=V(igraph)[name=="venice"], to=V(igraph), weights=NA, mode = c("out"))
# Set colors to plot the distances:
oranges <- colorRampPalette(c("dark red", "gold"))
col <- oranges(max(dist.from.venice )+1)
col <- col[dist.from.venice +1]
par(mfrow=c(1, 2))
set.seed(777)
plot(igraph)
set.seed(777)
plot(igraph, vertex.color=col, vertex.label=dist.from.venice , edge.arrow.size=.6, vertex.label.color="white")
# We can also find the shortest path between specific nodes. Say here between bremen and haarlem:
city.path <- shortest_paths(igraph, from = V(igraph)[name=="bremen"], to = V(igraph)[name=="haarlem"], output = "both") # both path nodes and edges
# Generate edge color variable to plot the path:
ecol <- rep("gray80", ecount(igraph))
ecol[unlist(city.path$epath)] <- "orange"
# Generate edge width variable to plot the path:
ew <- rep(2, ecount(igraph))
ew[unlist(city.path$epath)] <- 4
# Generate node color variable to plot the path:
vcol <- rep("gray40", vcount(igraph))
vcol[unlist(city.path$vpath)] <- "gold"
par(mfrow=c(1, 2))
set.seed(777)
plot(igraph)
set.seed(777)
plot(igraph, vertex.color=vcol, edge.color=ecol, edge.width=ew, edge.arrow.mode=0)
# EDGE_DENSITY is the proportion of edges in the network over all possible edges that could exist.
edge_density(igraph)
ecount(igraph)/(vcount(igraph)*(vcount(igraph)-1)) #for a directed network
# 7*6 possible edges (divide by 2 if you have an undirected networ) = 42 possible edges
# but only 18 exist
igraph[]
18/(7*6)
# RECIPROCITY measures the propensity of each edge to be a mutual edge;
# that is, the probability that if i is connected to j, j is also connected to i.
reciprocity(igraph)
# TRANSITIVITY, also known as clustering coefficient, measures that probability that adjacent nodes of a network are connected.
# In other words, if i is connected to j, and j is connected to k, what is the probability that i is also connected to k?
transitivity(igraph)
# The CENTRALIZATION of any network is a measure of how central its most central node is in relation to how central all the other nodes are
centr_degree(igraph, mode = c("in"), loops = TRUE,normalized = TRUE)$centralization
####################################################################
# Network communities
####################################################################
# Networks often have different clusters or communities of nodes that are more densely connected to each other than to the rest of the network.
# Let’s cover some of the different existing methods to identify these communities.
# The most straightforward way to partition a network is into connected components.
# Each component is a group of nodes that are connected to each other, but not to the rest of the nodes.
# For example, this network has 1 component [you would have for example 2 components if you had a node with no edge with any other nodes]
components(igraph)
par(mar=c(0,0,0,0))
plot(igraph)
# Most networks have a single giant connected component that includes most nodes (in our case: all nodes!).
# Most studies of networks actually focus on the GIANT COMPONENT
# (e.g. the shortest path between nodes in a network with two or more component is Inf!).
giant <- decompose(igraph)[[1]]
# Components can be weakly connected (in undirected networks) or
# strongly connected (in directed networks, where there is an edge that ends in every single node of that component).
# Even within a giant component, there can be different subsets of the network that are more connected to each other
# than to the rest of the network. The goal of community detection algorithms is to identify these subsets.
# There are a few different algorithms, each following a different logic.
# The walktrap algorithm finds communities through a series of short random walks.
# The idea is that these random walks tend to stay within the same community.
# The length of these random walks is 4 edges by default, but you may want to experiment with different values.
# The goal of this algorithm is to identify the partition that maximizes a modularity score.
cluster_walktrap(giant)
cluster_walktrap(giant, steps=10)
# igraph also makes it very easy to plot the resulting communities:
comm <- cluster_walktrap(giant)
modularity(comm) # modularity score
par(mar=c(0,0,0,0))
plot(comm, giant)
# Alternatively, we can also add to the graph some of the chacateristics of our nodes
V(giant)$color <- membership(comm)
par(mar=c(0,0,0,0))
plot(giant)
# Other methods are:
# The edge-betweenness method iteratively (recalculating at each step) removes edges with high betweenness, with the idea that they are likely
# to connect different parts. The best partitioning of the network is selected.
cluster_edge_betweenness(giant)
ceb <- cluster_edge_betweenness(giant)
dendPlot(ceb, mode="hclust")
plot(ceb, giant)
# Let’s examine the community detection igraph object:
length(ceb) # number of communities
membership(ceb) # community membership for each node
# High modularity for a partitioning reflects dense connections within communities and sparse connections across communities.
modularity(ceb) # how modular the graph partitioning is
crossing(ceb, giant) # boolean vector: TRUE for edges across communities
# Only for UNIDIRECT network: The label propagation method labels each node with unique labels in a random way, and then updates these labels by choosing the label assigned
# to the majority of their neighbors, and repeat this iteratively until each node has the most common labels among its neighbors.
cluster_label_prop(giant)
clp <- cluster_label_prop(giant)
plot(clp, giant)
# The fast and greedy method tries to directly optimize this modularity score (but for UNIDIRECT graphs only)
cluster_fast_greedy(giant)
# The infomap method attempts to map the flow of information in a network, and the different clusters in which information may get remain
# for longer periods. Similar to walktrap, but not necessarily maximizing modularity, but rather the so-called “map equation”.
# of the network. Here betweenness (gatekeeping potential) applies to edges, but the intuition is the same.
cluster_infomap(giant)
cli <- cluster_infomap(giant)
plot(cli, giant)
# My experience is that infomap tends to work better in most social science examples (websites, social media, classrooms, etc),
# but fastgreedy is faster.
# Another way in which we can think about network communities is in terms of hierarchy or structure. We’ll discuss one of these methods.
# K-core decomposition allows us to identify the core and the periphery of the network.
# A k-core is a maximal subnet of a network such that all nodes have at least degree K.
coreness(igraph)
which(coreness(igraph)==4) # what is the core of the network?
which(coreness(igraph)==1) # what is the periphery of the network?
kc <- coreness(igraph, mode="all")
plot(igraph, vertex.size=kc*6, vertex.label=kc)
# Visualizing network structure
V(igraph)$coreness <- coreness(igraph)
par(mfrow=c(2, 1), mar=c(0.1,0.1,1,0.1))
set.seed(777); fr <- layout_with_fr(igraph)
for (k in 1:2){
V(igraph)$color <- ifelse(V(igraph)$coreness>=k, "orange", "grey")
plot(igraph, main=paste0(k, '-core shell'), layout=fr)
}
# Homophily: the tendency of nodes to connect to others who are similar on some variable.
# assortativity_nominal() is for categorical variables (labels)
# assortativity() is for ordinal variables
# assortativity_degree() checks assortativity in node degrees
# The assortativity coefficient is positive if similar vertices (based on some external property)
# tend to connect to each, and negative otherwise (kind of correlation)
# create vectors with characters in each side
holland <- c("antwerp", "lisse", "haarlem")
germany <- c("bremen", "hamburg")
other <- c("venice", "delft ")
# we'll create a new variable as a node property called state
V(igraph)$state [V(igraph)$name %in% holland] <- 1
V(igraph)$state [V(igraph)$name %in% germany] <- 2
V(igraph)$state [V(igraph)$name %in% other] <- 3
igraph
V(igraph)
V(igraph)$name
str(V(igraph)$name)
str(V(igraph)$state)
V(igraph)$name
V(igraph)$state
assortativity(igraph, V(igraph)$state, directed=TRUE)
assortativity_degree(igraph, directed=TRUE)
# node we'll create a new color variable as a node property
V(igraph)$color <- NA
V(igraph)$color[V(igraph)$name %in% holland] <- "red"
V(igraph)$color[V(igraph)$name %in% germany] <- "gold"
V(igraph)$color[V(igraph)$name %in% other] <- "grey"
vertex_attr(igraph)
par(mar=c(0,0,0,0))
plot(igraph)
# If we want to indicate what the colors correspond to, we can add a legend.
par(mar=c(0,0,0,0))
plot(igraph)
legend(x=.75, y=.75, legend=c("Holland", "Germany", "Other"),
pch=21, pt.bg=c("red", "gold", "grey"), pt.cex=2, bty="n")