rm(list=ls(all=TRUE)) getwd() setwd("C:/Users/mw/Dropbox (Personale)/TOPIC MODEL") getwd() library(tidyverse) library(dplyr) library(igraph) library(hrbrthemes) library(ggraph) library(tidyverse) letters <- read.csv("network_ex.csv") letters str(letters) # change the name of the columns and converting everything to lower case (not needed of course...) el2 <- as.data.frame(cbind(sender = tolower(letters$source), receiver = tolower(letters$destination))) str(el2) # number of times that sender and receiver appears together (that is, the number of times a letter goest from A to B) el2 <- count(el2, sender, receiver) str(el2) colnames(el2)[3] <- "weight" table(el2$weight) el2[1:5,] #show the first 5 edges in the edgelist print(el2[c("sender", "receiver", "weight")]) # let's pass the matrix to igraph to create our network # Note that if the network is unidirected, then it doesn't matter which vertex/node is in the 'V1' (i.e., first variable name) versus 'V2' columns. # However, if the network is directed, then the convention is that the edge goes FROM V1 TO V2. # here note that directed=TRUE; otherwise write: directed=FALSE # Remember that if your vertex/node IDs are numbers, it is better to designate them as characters (or factors) # or else igraph can get confused igraph <- graph_from_data_frame(el2, directed = TRUE) igraph # What does it mean? - D means directed (otherwise you would have read U for unidirected) # N means named graph # W means weighted graph # 7 is the number of nodes # 18 is the number of edges # name (v/c) means name is a node attribute and it’s a character # weight (e/n) means weight is an edge attribute and it’s numeric (you can also add other edge attributes if you want as separated columns!) # This is how you access specific elements within the igraph object: vcount(igraph) # number of vertices (nodes) V(igraph) # nodes V(igraph)$name # names of each node vertex_attr(igraph) # all attributes of the nodes ecount(igraph) # number of edges E(igraph) # edges E(igraph)$weight # weights for each edge # getting the weight for each of the edges in the list of edges that originate from node number 5 E(igraph) [ from(5) ]$weight # getting the weight for each of the edges in the list of edges that end onto node number 3 E(igraph) [ to(3) ]$weight # getting the weight of the edge that simple connects just node 5 and node 3 E(igraph)[5 %--% 3]$weight edge_attr(igraph) # all attributes of the edges # you can also create the corresponding adjacency matrix igraph[] # adjacency matrix; note: 3 letters from antwerp to lisse, but just 1 from lisse to antwerp igraph[1,] # first row of adjacency matrix #################################################################### # Network visualization #################################################################### # mar is A numerical vector of the form c(bottom, left, top, right) which gives the number of lines of margin # to be specified on the four sides of the plot par(mar=c(0,0,0,0)) plot(igraph) # (look at the arrows! if you have a directional network, the arrows are on both sides!) par(mar=c(0,0,0,0)) plot(igraph, vertex.color = "grey", # change color of nodes vertex.label.color = "black", # change color of labels vertex.label.cex = .75, # change size of labels to 75% of original size edge.curved=.25, # add a 25% curve to the edges edge.color="grey20", # change edge color to grey edge.arrow.size = 1, # increase the size of the edges vertex.shape="square" ) # change the shape of the nodes as a square # Now imagine that we want to modify some of these plotting attributes so that they are function of network properties # For example, we can change the width of the edges according to their weight par(mar=c(0,0,0,0)) plot(igraph, vertex.color = "grey", # change color of nodes vertex.label.color = "black", # change color of labels vertex.label.cex = .75, # change size of labels to 75% of original size edge.curved=.25, # add a 25% curve to the edges edge.color="grey20", # change edge color to grey edge.arrow.size = 1, # increase the size of the edges vertex.shape="square", # change the shape of the nodes as a square edge.width=E(igraph)$weight) # add the weight to the edges par(mar=c(0,0,0,0)) plot(igraph, vertex.color = "grey", # change color of nodes vertex.label.color = "black", # change color of labels vertex.label.cex = .75, # change size of labels to 75% of original size edge.curved=.25, # add a 25% curve to the edges edge.color="grey20", # change edge color to grey edge.arrow.size = 1, # increase the size of the edges vertex.shape="square", # change the shape of the nodes as a square edge.width=E(igraph)$weight, # add the weight to the edges edge.label = el2$weight, # show the label of the weights along the edges [take a look now at the relationship between antwerp to lisse] edge.label.font=2, # font of the label (2=bold) edge.label.cex=2, edge.label.color="red") # colour of the label # We can also change the colors of each node based on some of their characteristics # create vectors with characters in each side holland <- c("antwerp", "lisse", "haarlem") germany <- c("bremen", "hamburg") other <- c("venice", "delft ") # node we'll create a new color variable as a node property V(igraph)$color <- NA V(igraph)$color[V(igraph)$name %in% holland] <- "red" V(igraph)$color[V(igraph)$name %in% germany] <- "gold" V(igraph)$color[V(igraph)$name %in% other] <- "grey" vertex_attr(igraph) par(mar=c(0,0,0,0)) plot(igraph) # If we want to indicate what the colors correspond to, we can add a legend. par(mar=c(0,0,0,0)) plot(igraph) legend(x=.75, y=.75, legend=c("Holland", "Germany", "Other"), pch=21, pt.bg=c("red", "gold", "grey"), pt.cex=2, bty="n") # Another common adjustment is to change the size of the nodes and node labels so that they match their weight igraph[] strength(igraph) strength(igraph, mode="out") strength(igraph, mode="in") V(igraph)$size <- strength(igraph) par(mar=c(0,0,0,0)) plot(igraph) # Now we’re only going to show the labels of character that gets more tha 9 letters V(igraph)$label <- ifelse( strength(igraph)>=10, V(igraph)$name, NA ) par(mar=c(0,0,0,0)) plot(igraph) # Up to now, each time we run the plot function, the nodes appear to be in a different location. # Why? Because it’s running a probabilistic function trying to locate them in the optimal way possible. # However, we can also specify the layout for the plot; that is, the (x,y) coordinates where each node will be placed. # igraph has a few different layouts built-in, that will use different algorithms to find an optimal distribution of nodes. # The following code illustrates some of these: par(mfrow=c(2, 3), mar=c(0,0,1,0)) plot(igraph, layout=layout_randomly, main="Random") plot(igraph, layout=layout_in_circle, main="Circle") plot(igraph, layout=layout_as_star, main="Star") plot(igraph, layout=layout_as_tree, main="Tree") plot(igraph, layout=layout_on_grid, main="Grid") plot(igraph, layout=layout_with_fr, main="Force-directed") # Note that each of these is actually just a matrix of (x,y) locations for each node. l <- layout_randomly(igraph) str(l) # The most popular layouts are force-directed. These algorithms, such as Fruchterman-Reingold, # try to position the nodes so that the edges have similar length and there are as few crossing edges as possible. # The idea is to generate “clean” layouts, where nodes that are closer to each other share more connections in common à that those that are located further apart. Note that this is a non-deterministic algorithm: # choosing a different seed will generate different layouts. par(mfrow=c(1,2)) set.seed(777) fr <- layout_with_fr(igraph, niter=1000) par(mar=c(0,0,0,0)) plot(igraph, layout=fr) set.seed(666) fr2 <- layout_with_fr(igraph, niter=1000) par(mar=c(0,0,0,0)) plot(igraph, layout=fr2) # Another popular layout is the graphopt algorithm created by Michael Schmuhl par(mfrow=c(1,2)) set.seed(777) gh <- layout_with_graphopt(igraph, niter=1000) par(mar=c(0,0,0,0)) plot(igraph, layout=gh) set.seed(666) gh2 <- layout_with_graphopt(igraph, niter=1000) par(mar=c(0,0,0,0)) plot(igraph, layout=gh2) # another alternative is layout.fruchterman.reingold #################################################################### #################################################################### Other ways to represent a network #################################################################### #################################################################### # heatmap of the network matrix netm <- get.adjacency(igraph, attr="weight", sparse=F) str(netm) palf <- colorRampPalette(c("gold", "dark orange")) heatmap(netm[,1:7], Rowv = NA, Colv = NA, col = palf(100), scale="none", margins=c(10,10) ) #################################################################### #################################################################### Descriptive analysis about a network #################################################################### #################################################################### el2 <- as.data.frame(cbind(sender = tolower(letters$source), receiver = tolower(letters$destination))) head(el2) str(el2) # number of times that sender and receiver appears together (that is, number of times a letter goest from A to B) el2 <- count(el2, sender, receiver) str(el2) colnames(el2)[3] <- "weight" table(el2$weight) el2[1:5,] # show the first 5 edges in the edgelist print(el2[c("sender", "receiver", "weight")]) igraph <- graph_from_data_frame(el2, directed = TRUE) igraph # What are the most important nodes in a network? What is the propensity of two nodes that are connected # to be both connected to a third node? What are the different hidden communities in a network? # Let's try to answer to these questions! #################################################################### # Measuring node importance #################################################################### # Node properties # We’ll start with descriptive statistics at the node level. All of these are in some way measures of importance or centrality. # The most basic measure is DEGREE, the number of adjacent edges to each node. # It is often considered a measure of direct influence. # In our letter network, it will be the unique number of cities that each city is interacting with. degree(igraph) sort(degree(igraph)) # In directed graphs, as in ours, there are three types of degree: # indegree (incoming edges), outdegree (outgoing edges), and total degree. # You can find these using mode="in" or mode="out" or mode="total". degree(igraph, mode="in") degree(igraph, mode="out") degree(igraph, mode="total") deg <- degree(igraph) plot(igraph, vertex.size=deg*3) hist(deg, main="Histogram of node degree", breaks = 7) # STRENGHT (that we have already seen!) is a weighted measure of degree that takes into account the number of edges that # go from one node to another. In this network, it will be the total number of interactions of each city with anyone else. igraph[] sort(strength(igraph)) strength(igraph, mode="in") strength(igraph, mode="out") # CLOSENESS measures how many steps are required to access every other node from a given node. # It’s a measure of how long information takes to arrive (who hears news first?). Higher values mean less centrality. plot(igraph) sort(closeness(igraph, normalized=TRUE)) which.max(closeness(igraph, normalized=TRUE)) max(closeness(igraph, normalized=TRUE)) # BETWEENNESS measures brokerage or gatekeeping potential. # It is (approximately) the number of shortest paths between nodes that pass through a particular node. sort(betweenness(igraph)) # EIGENVECTOR CENTRALITY is a measure of being well-connected connected to the well-connected. # First eigenvector of the graph adjacency matrix. [Only works with undirected networks? se non metti directed=TRUE, default is FALSE] centrality <- eigen_centrality(igraph, directed = TRUE) sort(centrality$vector) # PAGE RANK approximates probability that any message will arrive to a particular node. # This algorithm was developed by Google founders, and originally applied to website links. sort(page_rank(igraph)$vector) # The HUBS and AUTHORITY algorithm developed by Jon Kleinberg was initially used to examine web pages. # Hubs were expected to contain catalogs with a large number of outgoing links; # while authorities would get many incoming links from hubs, presumably because of their high-quality relevant information. # AUTHORITY score is therefore measure of centrality initially applied to the Web. # A node has high authority when it is linked by many other nodes that are linking many other nodes. sort(authority_score(igraph)$vector) # authority_score(igraph)$vector # authority_score(igraph, weights=NA)$vector hs <- hub_score(igraph, weights=NA)$vector as <- authority_score(igraph, weights=NA)$vector par(mfrow=c(1,2)) plot(igraph, vertex.size=hs*50, main="Hubs") plot(igraph, vertex.size=as*30, main="Authorities") # Summing up sort(degree(igraph)) sort(strength(igraph)) sort(closeness(igraph, normalized=TRUE)) sort(betweenness(igraph)) sort(centrality$vector) sort(page_rank(igraph)$vector) sort(authority_score(igraph)$vector) # Finally, not exactly a measure of centrality, but we can learn more about who each node is connected to by using the following functions: # neighbors (for direct neighbors) and ego (for neighbors up to n neighbors away) V(igraph)$name plot(igraph) igraph[] neighbors(igraph, v=which(V(igraph)$name=="bremen")) ego(igraph, order=1, nodes=which(V(igraph)$name=="bremen")) ego(igraph, order=2, nodes=which(V(igraph)$name=="bremen")) neighbors(igraph, v=which(V(igraph)$name=="antwerp")) ego(igraph, order=1, nodes=which(V(igraph)$name=="antwerp")) ego(igraph, order=2, nodes=which(V(igraph)$name=="antwerp")) #################################################################### # Measuring Network properties #################################################################### # Let’s now try to describe what a network looks like as a whole. We can start with measures of the size of a network. # DIAMETER is the length of the longest path (in number of edges) between two nodes. # To find the diameter of a graph, first find the shortest path between each pair of vertices. # The greatest length of any of these paths is the diameter of the graph. # [We can use get_diameter to identify this path] MEAN_DISTANCE is the average number of edges between any two nodes in the network. # We can find each of these paths between pairs of edges with distances. plot(igraph) diameter(igraph, directed=TRUE, weights=NA) # Bremen - Delft is 3 and not 1 cause there is no direct link that goes from Delft to Bremen, but only viceversa get_diameter(igraph, directed=TRUE, weights=NA) # look at the differences if directed=TRUE or directed=FALSE! mean_distance(igraph, directed=TRUE) mean_distance(igraph, directed=FALSE) dist <- distances(igraph, weights=NA, mode = c("out")) dist[1:7, 1:7] # We can extract the distances to a node or set of nodes we are interested in. Here we will get the distance of every city from Venice. igraph dist.from.venice <- distances(igraph, v=V(igraph)[name=="venice"], to=V(igraph), weights=NA, mode = c("out")) # Set colors to plot the distances: oranges <- colorRampPalette(c("dark red", "gold")) col <- oranges(max(dist.from.venice )+1) col <- col[dist.from.venice +1] par(mfrow=c(1, 2)) set.seed(777) plot(igraph) set.seed(777) plot(igraph, vertex.color=col, vertex.label=dist.from.venice , edge.arrow.size=.6, vertex.label.color="white") # We can also find the shortest path between specific nodes. Say here between bremen and haarlem: city.path <- shortest_paths(igraph, from = V(igraph)[name=="bremen"], to = V(igraph)[name=="haarlem"], output = "both") # both path nodes and edges # Generate edge color variable to plot the path: ecol <- rep("gray80", ecount(igraph)) ecol[unlist(city.path$epath)] <- "orange" # Generate edge width variable to plot the path: ew <- rep(2, ecount(igraph)) ew[unlist(city.path$epath)] <- 4 # Generate node color variable to plot the path: vcol <- rep("gray40", vcount(igraph)) vcol[unlist(city.path$vpath)] <- "gold" par(mfrow=c(1, 2)) set.seed(777) plot(igraph) set.seed(777) plot(igraph, vertex.color=vcol, edge.color=ecol, edge.width=ew, edge.arrow.mode=0) # EDGE_DENSITY is the proportion of edges in the network over all possible edges that could exist. edge_density(igraph) ecount(igraph)/(vcount(igraph)*(vcount(igraph)-1)) #for a directed network # 7*6 possible edges (divide by 2 if you have an undirected networ) = 42 possible edges # but only 18 exist igraph[] 18/(7*6) # RECIPROCITY measures the propensity of each edge to be a mutual edge; # that is, the probability that if i is connected to j, j is also connected to i. reciprocity(igraph) # TRANSITIVITY, also known as clustering coefficient, measures that probability that adjacent nodes of a network are connected. # In other words, if i is connected to j, and j is connected to k, what is the probability that i is also connected to k? transitivity(igraph) # The CENTRALIZATION of any network is a measure of how central its most central node is in relation to how central all the other nodes are centr_degree(igraph, mode = c("in"), loops = TRUE,normalized = TRUE)$centralization #################################################################### # Network communities #################################################################### # Networks often have different clusters or communities of nodes that are more densely connected to each other than to the rest of the network. # Let’s cover some of the different existing methods to identify these communities. # The most straightforward way to partition a network is into connected components. # Each component is a group of nodes that are connected to each other, but not to the rest of the nodes. # For example, this network has 1 component [you would have for example 2 components if you had a node with no edge with any other nodes] components(igraph) par(mar=c(0,0,0,0)) plot(igraph) # Most networks have a single giant connected component that includes most nodes (in our case: all nodes!). # Most studies of networks actually focus on the GIANT COMPONENT # (e.g. the shortest path between nodes in a network with two or more component is Inf!). giant <- decompose(igraph)[[1]] # Components can be weakly connected (in undirected networks) or # strongly connected (in directed networks, where there is an edge that ends in every single node of that component). # Even within a giant component, there can be different subsets of the network that are more connected to each other # than to the rest of the network. The goal of community detection algorithms is to identify these subsets. # There are a few different algorithms, each following a different logic. # The walktrap algorithm finds communities through a series of short random walks. # The idea is that these random walks tend to stay within the same community. # The length of these random walks is 4 edges by default, but you may want to experiment with different values. # The goal of this algorithm is to identify the partition that maximizes a modularity score. cluster_walktrap(giant) cluster_walktrap(giant, steps=10) # igraph also makes it very easy to plot the resulting communities: comm <- cluster_walktrap(giant) modularity(comm) # modularity score par(mar=c(0,0,0,0)) plot(comm, giant) # Alternatively, we can also add to the graph some of the chacateristics of our nodes V(giant)$color <- membership(comm) par(mar=c(0,0,0,0)) plot(giant) # Other methods are: # The edge-betweenness method iteratively (recalculating at each step) removes edges with high betweenness, with the idea that they are likely # to connect different parts. The best partitioning of the network is selected. cluster_edge_betweenness(giant) ceb <- cluster_edge_betweenness(giant) dendPlot(ceb, mode="hclust") plot(ceb, giant) # Let’s examine the community detection igraph object: length(ceb) # number of communities membership(ceb) # community membership for each node # High modularity for a partitioning reflects dense connections within communities and sparse connections across communities. modularity(ceb) # how modular the graph partitioning is crossing(ceb, giant) # boolean vector: TRUE for edges across communities # Only for UNIDIRECT network: The label propagation method labels each node with unique labels in a random way, and then updates these labels by choosing the label assigned # to the majority of their neighbors, and repeat this iteratively until each node has the most common labels among its neighbors. cluster_label_prop(giant) clp <- cluster_label_prop(giant) plot(clp, giant) # The fast and greedy method tries to directly optimize this modularity score (but for UNIDIRECT graphs only) cluster_fast_greedy(giant) # The infomap method attempts to map the flow of information in a network, and the different clusters in which information may get remain # for longer periods. Similar to walktrap, but not necessarily maximizing modularity, but rather the so-called “map equation”. # of the network. Here betweenness (gatekeeping potential) applies to edges, but the intuition is the same. cluster_infomap(giant) cli <- cluster_infomap(giant) plot(cli, giant) # My experience is that infomap tends to work better in most social science examples (websites, social media, classrooms, etc), # but fastgreedy is faster. # Another way in which we can think about network communities is in terms of hierarchy or structure. We’ll discuss one of these methods. # K-core decomposition allows us to identify the core and the periphery of the network. # A k-core is a maximal subnet of a network such that all nodes have at least degree K. coreness(igraph) which(coreness(igraph)==4) # what is the core of the network? which(coreness(igraph)==1) # what is the periphery of the network? kc <- coreness(igraph, mode="all") plot(igraph, vertex.size=kc*6, vertex.label=kc) # Visualizing network structure V(igraph)$coreness <- coreness(igraph) par(mfrow=c(2, 1), mar=c(0.1,0.1,1,0.1)) set.seed(777); fr <- layout_with_fr(igraph) for (k in 1:2){ V(igraph)$color <- ifelse(V(igraph)$coreness>=k, "orange", "grey") plot(igraph, main=paste0(k, '-core shell'), layout=fr) } # Homophily: the tendency of nodes to connect to others who are similar on some variable. # assortativity_nominal() is for categorical variables (labels) # assortativity() is for ordinal variables # assortativity_degree() checks assortativity in node degrees # The assortativity coefficient is positive if similar vertices (based on some external property) # tend to connect to each, and negative otherwise (kind of correlation) # create vectors with characters in each side holland <- c("antwerp", "lisse", "haarlem") germany <- c("bremen", "hamburg") other <- c("venice", "delft ") # we'll create a new variable as a node property called state V(igraph)$state [V(igraph)$name %in% holland] <- 1 V(igraph)$state [V(igraph)$name %in% germany] <- 2 V(igraph)$state [V(igraph)$name %in% other] <- 3 igraph V(igraph) V(igraph)$name str(V(igraph)$name) str(V(igraph)$state) V(igraph)$name V(igraph)$state assortativity(igraph, V(igraph)$state, directed=TRUE) assortativity_degree(igraph, directed=TRUE) # node we'll create a new color variable as a node property V(igraph)$color <- NA V(igraph)$color[V(igraph)$name %in% holland] <- "red" V(igraph)$color[V(igraph)$name %in% germany] <- "gold" V(igraph)$color[V(igraph)$name %in% other] <- "grey" vertex_attr(igraph) par(mar=c(0,0,0,0)) plot(igraph) # If we want to indicate what the colors correspond to, we can add a legend. par(mar=c(0,0,0,0)) plot(igraph) legend(x=.75, y=.75, legend=c("Holland", "Germany", "Other"), pch=21, pt.bg=c("red", "gold", "grey"), pt.cex=2, bty="n")