5 Social network analysis

5 Social network analysis#

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

5.1 Basic concepts of network#

A network is frequently used interchangeably with a graph, but it typically highlights real-world applications and is commonly associated with social relationships (social networks), and built environments (road networks).
A graph (\(G\)) is a mathematical structure used to model pairwise relations between objects. It consists of a set of vertices (nodes) and a set of edges (links) that connect pairs of vertices.

Graph Type

Graph Type	Description	NetworkX Class
Undirected Graph	A graph where edges have no direction.	Graph
Directed Graph	A graph where edges have a direction, indicated by an arrow.	DiGraph
Multi-(undirected) Graph	An undirected graph with parallel edges.	MultiGraph
Multi-directed Graph	A directed graph with parallel edges.	MultiDiGraph

Note: All types of graphs can have self-loops.

5.2 Measurements for nodes and edges#

# Create an empty undirected graph
G = nx.Graph()

# Add edge with weight
G.add_edge(1, 2, weight=30)
G.add_edge(1, 3, weight=5)
G.add_edge(2, 3, weight=22)
G.add_edge(2, 4, weight=2)
G.add_edge(3, 4, weight=37)

# Print edge infos
print(G.edges(data=True))

[(1, 2, {'weight': 30}), (1, 3, {'weight': 5}), (2, 3, {'weight': 22}), (2, 4, {'weight': 2}), (3, 4, {'weight': 37})]

# Print node infos, the {} represents the node attributes for each node, e.g, you can input {'attribute_name': 'value'}
print(G.nodes(data=True))

[(1, {}), (2, {}), (3, {}), (4, {})]

# Add node with attributes
G.add_node(1, name='A')
G.add_node(2, name='B')
G.add_node(3, name='C')
G.add_node(4, name='D')

# Print node infos
print(G.nodes(data=True))

[(1, {'name': 'A'}), (2, {'name': 'B'}), (3, {'name': 'C'}), (4, {'name': 'D'})]

# Print weight info
weights = nx.get_edge_attributes(G, 'weight')
weights

{(1, 2): 30, (1, 3): 5, (2, 3): 22, (2, 4): 2, (3, 4): 37}

#Plot the graph
pos = nx.random_layout(G, seed=42) # Random layout for node positions and k controls the distance between nodes

nx.draw(G, pos, with_labels=True, node_color='lightblue', edge_color='gray', node_size=500, font_size=16, font_weight='bold')

# Draw edge labels (weights)
edge_labels = nx.get_edge_attributes(G, 'weight')
nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels, font_color='red', font_size=12)

{(1, 2): Text(0.5532658439544372, 0.7746875514598375, '30'),
 (1, 3): Text(0.26527960443106796, 0.553355212105517, '5'),
 (2, 3): Text(0.44400741321753107, 0.3773273824606946, '22'),
 (2, 4): Text(0.39503984440515744, 0.7324168859908449, '2'),
 (3, 4): Text(0.10705093762945314, 0.5110867082546724, '37')}

_images/91144004f6201f44829de0387e66ec7acb33dbdd0c56e7a935bc975a790c0394.png

5.2.1 Degree centrality of nodes #

The degree centrality values are normalized by dividing by the maximum possible degree in a simple graph n-1 where n is the number of nodes in \(G\).

# Output the degree centrality for each node
nx.degree_centrality(G)

{1: 0.6666666666666666, 2: 1.0, 3: 1.0, 4: 0.6666666666666666}

5.2.2 Closeness centrality of nodes #

Closeness centrality of a node \(u\) is the reciprocal of the average shortest path distance to \(u\) over all \(n-1\) reachable nodes.
\(G =(U,V)\).
\(C(v) = \frac{n-1}{\sum_{v =1}^{n-1} d(v, u)}\)
where \(d(v, u)\) is the shortest-path distance between v and u, and n-1 is the number of nodes reachable from \(u\).

Shortest path and distance

1–>2 : 1,3,2 (5+22 = 27)
1–>3: 1,3 (5)
1–>4: 1,3,2,4 (5+22+2=29)
2–>3: 2,3 (22)
2–>4: 2,4 (2)
3–>4: 3,2,4 (22+2=24)

# Out put all shortest path and distance for each node
for node_i in G.nodes():
    for node_j in G.nodes():
        if node_i != node_j:
            print(f"Shortest path from {node_i} to {node_j}: {nx.shortest_path(G, source=node_i, target=node_j, weight='weight')}")
            print(f"Shortest path distance from {node_i} to {node_j}: {nx.shortest_path_length(G, source=node_i, target=node_j, weight='weight')}")
            print("---------------------------")

Shortest path from 1 to 2: [1, 3, 2]
Shortest path distance from 1 to 2: 27
---------------------------
Shortest path from 1 to 3: [1, 3]
Shortest path distance from 1 to 3: 5
---------------------------
Shortest path from 1 to 4: [1, 3, 2, 4]
Shortest path distance from 1 to 4: 29
---------------------------
Shortest path from 2 to 1: [2, 3, 1]
Shortest path distance from 2 to 1: 27
---------------------------
Shortest path from 2 to 3: [2, 3]
Shortest path distance from 2 to 3: 22
---------------------------
Shortest path from 2 to 4: [2, 4]
Shortest path distance from 2 to 4: 2
---------------------------
Shortest path from 3 to 1: [3, 1]
Shortest path distance from 3 to 1: 5
---------------------------
Shortest path from 3 to 2: [3, 2]
Shortest path distance from 3 to 2: 22
---------------------------
Shortest path from 3 to 4: [3, 2, 4]
Shortest path distance from 3 to 4: 24
---------------------------
Shortest path from 4 to 1: [4, 2, 3, 1]
Shortest path distance from 4 to 1: 29
---------------------------
Shortest path from 4 to 2: [4, 2]
Shortest path distance from 4 to 2: 2
---------------------------
Shortest path from 4 to 3: [4, 2, 3]
Shortest path distance from 4 to 3: 24
---------------------------

# Output the closeness centrality for each node
# cc of node 1
print('cc node 1',(4-1) / (27 + 5 + 29))
# cc of node 2
print('cc node 2',(4-1) / (27 + 22 + 2))
# cc of node 3
print('cc node 3',(4-1) / (5 + 22 + 24))
# cc of node 4
print('cc node 4',(4-1) / (29 + 2 + 24))

cc node 1 0.04918032786885246
cc node 2 0.058823529411764705
cc node 3 0.058823529411764705
cc node 4 0.05454545454545454

# use nx to get cc
nx.closeness_centrality(G, distance='weight')

{1: 0.04918032786885246,
0.058823529411764705,
0.058823529411764705,
0.05454545454545454}

5.2.3 Betweenness centrality of nodes #

Betweenness centrality of a node \(v\) is the sum of the fraction of all-pairs shortest paths that pass through \(v\).

\(C_B(v) = \sum_{s,t \in V} \frac{\sigma({s,t |v })}{\sigma({s, t})}\)

where \(V\) is the set of nodes,
\(\sigma({s, t})\) is the number of shortest paths,
\(\sigma({s,t |v })\) is the number of those paths passing through some node \(v\) other than \({s, t}\).
\(\text{Normalization } C_B \text{ of nodes} = \frac{C_B \text{ of nodes}}{\text{Normalization Factor}}\)
\(\text{Normalization Factor (NF) for undirected G} = \frac{(n-1) \cdot (n-2)}{2}\) ,
\(n\) is the nodes numbers.

# Calculate the normalization factor
(4-1)*(4-2)/2

3.0

# Normalized betweenness centrality of node 1
print('NZ BC of Node 1 ', 0/3)
# Normalized betweenness centrality of node 2
print('NZ BC of Node 2 ', 0/3)
# Normalized betweenness centrality of node 3
print('NZ BC of Node 3 ', 2/3)
# Normalized betweenness centrality of node 4
print('NZ BC of Node 4 ', 0/3)

NZ BC of Node 1  0.0
NZ BC of Node 2  0.0
NZ BC of Node 3  0.6666666666666666
NZ BC of Node 4  0.0

# Get betweenness centrality using nx
nx.betweenness_centrality(G,  weight='weight', normalized=True)

{1: 0.0, 2: 0.6666666666666666, 3: 0.6666666666666666, 4: 0.0}

5.2.4 Betweenness centrality of edges #

Betweenness centrality of an edge \(e\) is the sum of the fraction of all-pairs shortest paths that pass through \(e\).

\(C_B(e) = \sum_{s,t \in V} \frac{\sigma({s,t |e})}{\sigma({s, t})}\)

where \(V\) is the set of nodes,
\(\sigma({s, t})\) is the number of shortest paths \({s, t}\),
\(\sigma({s,t |e })\) is the number of those paths passing through edge \(e\).
\(\text{Normalization } C_B \text{ of edges} = \frac{C_B \text{ of edges}}{\text{Normalization Factor}}\)
\(\text{Normalization Factor (NF) for undirected G} = \frac{n(n-1)}{2}\)
\(n\) is the nodes numbers.

# Calculate the normalization factor
(4-1)*4/2

6.0

# Normalized betweenness centrality of edge (1,2)
print('NZ BC of edge (1 ,2)', 0/6)
# Normalized betweenness centrality of edge (1,3)
print('NZ BC of edge (1 ,3)', 3/6)
# Normalized betweenness centrality of edge (2,3)
print('NZ BC of edge (2 ,3)', 4/6)
# Normalized betweenness centrality of edge (2,4)
print('NZ BC of edge (2 ,4)', 3/6)
# Normalized betweenness centrality of edge (3,4)
print('NZ BC of edge (3 ,4)', 0/6)

NZ BC of edge (1 ,2) 0.0
NZ BC of edge (1 ,3) 0.5
NZ BC of edge (2 ,3) 0.6666666666666666
NZ BC of edge (2 ,4) 0.5
NZ BC of edge (3 ,4) 0.0

# Get betweenness centrality using nx
nx.edge_betweenness_centrality(G, weight='weight', normalized=True)

{(1, 2): 0.0,
 (1, 3): 0.5,
 (2, 3): 0.6666666666666666,
 (2, 4): 0.5,
 (3, 4): 0.0}

Note: If we create a DiGraph, are measurements still the same?

Gd = nx.DiGraph()

Gd.add_edge(1, 2, weight=30)

Gd.add_edge(1, 3, weight=5)

Gd.add_edge(2, 3, weight=22)

Gd.add_edge(2, 4, weight=2)

Gd.add_edge(3, 4, weight=37)

5.3 Community detection#

Community detection is the process of identifying groups of nodes in a network that are more densely connected to each other than to the rest of the network. These groups are called communities or clusters.

In this section, we will use the real word dataset to illustrate the Link prediction in SNA. Download link

# Load the data
G = nx.read_edgelist("facebook_combined.txt.gz")
# Basic info
print(f"Graph has {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
print(f"Is the graph directed? {G.is_directed()}")

Graph has 4039 nodes and 88234 edges.
Is the graph directed? False

# We select a subgraph with 300 nodes for example.
sub_G = G.subgraph(list(G.nodes)[2300:2600])
# Basic info
print(f"Graph has {sub_G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")
print(f"Is the graph directed? {sub_G.is_directed()}")

Graph has 300 nodes and 88234 edges.
Is the graph directed? False

5.3.1 Visualization of graph#

The visualization of social network by the node degrees.

# Node sizes based on degree (higher-degree nodes are larger)
degree = dict(sub_G.degree())
node_sizes = [degree[node] * 3 for node in sub_G.nodes()]
# Node colors based on degree (higher-degree nodes get darker color)
node_colors = [degree[node] for node in sub_G.nodes()]
# Create a layout for positioning (spring layout )
pos = nx.spring_layout(sub_G, seed=42, k=0.25)  # k controls spacing
# Plot the subgraph
plt.figure(figsize=(10, 8))
nx.draw_networkx_nodes(sub_G, pos, node_size=node_sizes, cmap=plt.cm.Blues, node_color=node_colors, alpha=0.6)
nx.draw_networkx_edges(sub_G, pos, alpha=0.05)
nx.draw_networkx_labels(sub_G, pos, font_size=2, font_color="black")
# Title and display
plt.title("The visualisation of social network by the node degrees", fontsize=10)
plt.axis("off")
plt.show()

_images/f210167b557e2391e67a174c0ef76b40fc2a06c06fb3c92c3d7bf6e001e4e812.png

The visualization of social network by the edge betweenness centrality.

%%time
# Compute Edge Betweenness Centrality
edge_centrality = nx.edge_betweenness_centrality(sub_G)

CPU times: user 857 ms, sys: 2.07 ms, total: 859 ms
Wall time: 587 ms

# Normalize centrality values for visualization
centrality_values = np.array(list(edge_centrality.values()))
min_c, max_c = min(centrality_values), max(centrality_values)
norm_centrality = (centrality_values - min_c) / (max_c - min_c)  # Normalize between 0 and 1
# Use the updated colormap function
cmap = plt.colormaps.get_cmap("rainbow")
# Generate color values for edges
edge_colors = [cmap(value) for value in norm_centrality]
# Scale edge widths based on centrality
min_width, max_width = 0.1, 2
edge_widths = [min_width + (max_width - min_width) * value for value in norm_centrality]

import matplotlib.cm as cm
# Create figure and axis
fig, ax = plt.subplots(figsize=(10, 8))
# Compute node positions
pos = nx.spring_layout(sub_G, seed=42, k=0.2)
# Draw nodes
nx.draw_networkx_nodes(sub_G, pos, node_size=0.2, alpha=0.7, ax=ax)
# Draw edges with colormap
edges = list(edge_centrality.keys())
nx.draw_networkx_edges(sub_G, pos, edgelist=edges, edge_color=edge_colors, alpha=0.5, width=edge_widths, ax=ax)
# Add colorbar legend
sm = cm.ScalarMappable(cmap=cmap, norm=plt.Normalize(vmin=min_c, vmax=max_c))
cbar = plt.colorbar(sm, ax=ax)
cbar.set_label("Edge Betweenness Centrality", fontsize=12)
# Title and display
plt.title("The visualisation of social network by edge betweenness centrality", fontsize=10)
plt.axis("off")
plt.show()

_images/6b300d46b579829dbc5bb8cf3ef21199b6062c1ed7a5ce9a4ab7070d09372736.png

5.3.2 Girvan-Newman algorithm #

Girvan-Newman algorithm is a method for detecting communities in a graph by progressively removing edges with the highest betweenness centrality. The algorithm works as follows:

Compute the betweenness centrality of all edges in the graph.
Identify the edge with the highest betweenness centrality.
Remove that edge from the graph.
Check if the graph is disconnected (i.e., it has more than one connected component).
If the graph is disconnected, the connected components are considered communities.
Repeat steps 1-5 until the desired number of communities is detected or no edges remain.
Return the detected communities.

# We use Girvan-Newman method in networkx on sub_G as an example
# The output of girvan_newman is a generator of communities, i.e., it produces multiple possible clusterings (partitions of nodes into communities).
# Each result corresponds to a different level of the hierarchy as the Girvan-Newman algorithm progressively removes edges
from networkx.algorithms.community import girvan_newman
comp = girvan_newman(sub_G)

# Get the first two types of results: two types of results of community detection
# Extract first two levels of community partitions
first_comp = next(comp)
second_comp = next(comp)

# Convert to sorted lists for readability
first_communities= [sorted(list(c)) for c in first_comp]
second_communities = [sorted(list(c)) for c in second_comp]

print("The numbers of first communities:", len(first_communities))
print("The numbers of second communities:", len(second_communities))

The numbers of first communities: 15
The numbers of second communities: 16

# We use the first communities detected from 300 nodes to visualize the communities.
# Create a color map for the communities
# Generate a color map with distinct colors for each community
cmap = plt.get_cmap("tab20c", len(first_communities))
# Create a color mapping for each community
community_colors = {}
for i, community in enumerate(first_communities):
    for node in community:
        community_colors[node] = cmap(i)
# Create a layout for positioning (spring layout )
pos = nx.spring_layout(sub_G, seed=42, k=0.3)
# Plot the subgraph
plt.figure(figsize=(10, 8))
nx.draw_networkx_nodes(sub_G, pos, node_color=[community_colors[node] for node in sub_G.nodes()], alpha=0.6)
nx.draw_networkx_edges(sub_G, pos, alpha=0.05)
nx.draw_networkx_labels(sub_G, pos, font_size=5, font_color="black")
# Plot the legend
for i, community in enumerate(first_communities):
    plt.scatter([], [], color=cmap(i), label=f"Community {i+1}")
# Add legend to the plot
plt.legend(loc="upper right", fontsize=8, markerscale=1, frameon=False)
# Title and display
plt.title("The visualisation of communities in social network", fontsize=10)
plt.axis("off")
plt.show()

_images/0e00f780be3de8cfd6ccc20d8b5d9230f22af517e3e8e0a73a22ed5244e025a5.png

5.3.3 Fluid Communities algorithm #

Fluid Communities algorithm is a method for detecting communities in a connected graph based on the concept of fluid dynamics. It works by simulating the flow of “fluid” through the network, where nodes represent containers and edges represent pipes. The algorithm identifies communities by analyzing how the fluid flows and accumulates in different regions of the network. The algorithm is particularly effective for detecting overlapping communities, where nodes can belong to multiple communities simultaneously.

# We apply the Fluid Communities method to a different graph because this method requires a connected graph, but `sub_G` is not connected.
from networkx.algorithms.community import asyn_fluidc

# Create a ladder graph (always connected)
G_e = nx.ladder_graph(200)  # 200 nodes

fluid_communities = asyn_fluidc(G_e, 8)

# Convert to sorted lists for readability
fluid_communities = [sorted(list(c)) for c in fluid_communities]
print("The number of fluid communities:", len(fluid_communities))
print("Communities:", fluid_communities)

The number of fluid communities: 8
Communities: [[118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355], [177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399], [169, 170, 171, 172, 173, 174, 175, 176, 369, 370, 371, 372, 373, 374, 375, 376], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229], [60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278], [156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368], [79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317], [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259]]

# define a function to plot the communities
def plot_communities(G_v, communities):
    # Create a color map for the communities
    cmap = plt.get_cmap("tab20c", len(communities))
    # Create a color mapping for each community
    community_colors = {}
    for i, com in enumerate(communities):
        for node in com:
            community_colors[node] = cmap(i)
    # Create a layout for positioning (spring layout )
    pos = nx.spring_layout(G_v, seed=42, k=0.15)
    # Plot the subgraph
    plt.figure(figsize=(10, 8))
    nx.draw_networkx_nodes(G_v, pos, node_color=[community_colors[node] for node in G_v.nodes()], alpha=0.6)
    nx.draw_networkx_edges(G_v, pos, alpha=0.05)
    nx.draw_networkx_labels(G_v, pos, font_size=3, font_color="black")
    # Plot the legend
    for i, com in enumerate(communities):
        plt.scatter([], [], color=cmap(i), label=f"Community {i+1}")
    # Add legend to the plot
    plt.legend(loc="upper right", fontsize=8, markerscale=1, frameon=False)
    # Title and display
    plt.title("The visualisation of communities in social network", fontsize=10)
    plt.axis("off")
    plt.show()

plot_communities(G_e, fluid_communities)

_images/2ed7fbbe88888fd9f34c8f10cdf2aac496f70fead38f6c2b238aadfa0ed5631f.png

5.3.4 Label Propagation algorithm #

Label Propagation algorithm is a method for detecting communities in a graph based on the idea of propagating labels through the network. The algorithm works as follows:

Initialize each node with a unique label.
For each node, update its label to the most frequent label among its neighbors.
Repeat step 2 until convergence (i.e., no labels change).
Return the final labels as communities.
The algorithm is efficient and can handle large networks, making it suitable for real-world applications.

# We apply the Label Propagation method to sub_G.
from networkx.algorithms.community import label_propagation_communities
# Get the communities
lp_communities = label_propagation_communities(sub_G)
# Convert to sorted lists for readability
lp_communities = [sorted(list(c)) for c in lp_communities]
print("The number of label propagation communities:", len(lp_communities))
print("Communities:", lp_communities)

The number of label propagation communities: 19
Communities: [['2723', '2727', '2728', '2730', '2733', '2735', '2737', '2739', '2742', '2744', '2747', '2748', '2751', '2753', '2758', '2762', '2765', '2768', '2770', '2772', '2778', '2779', '2780', '2781', '2783', '2784', '2785', '2787', '2790', '2791', '2793', '2795', '2796', '2797', '2801', '2802', '2803', '2804', '2811', '2812', '2816', '2818', '2819', '2820', '2823', '2825', '2828', '2829', '2830', '2831', '2832', '2833', '2840', '2843', '2844', '2845', '2846', '2847', '2848', '2849', '2853', '2855', '2856', '2859', '2863', '2865', '2868', '2870', '2871', '2875', '2876', '2878', '2881', '2882', '2894', '2895', '2901', '2904', '2914', '2918', '2921', '2923', '2932', '2934', '2936', '2937', '2939', '2942', '2947', '2950', '2951', '2955', '2958', '2965', '2970', '2973', '2975', '2980', '2984', '2989', '2990', '2991', '2992', '2995', '2996', '2998', '3002', '3006', '3007', '3009', '3013', '3015', '3016', '3017', '3018', '3021', '3024', '3027', '3032', '3035', '3038', '3039', '3041', '3043', '3046', '3048', '3049', '3050', '3051'], ['2729', '2731', '2738', '2741', '2743', '2745', '2746', '2749', '2750', '2754', '2755', '2756', '2757', '2759', '2761', '2763', '2766', '2773', '2777', '2782', '2786', '2794', '2798', '2800', '2806', '2809', '2810', '2815', '2821', '2827', '2835', '2837', '2839', '2850', '2852', '2854', '2862', '2864', '2866', '2867', '2872', '2873', '2874', '2877', '2880', '2884', '2887', '2890', '2891', '2896', '2897', '2905', '2906', '2907', '2908', '2909', '2910', '2911', '2912', '2913', '2915', '2916', '2917', '2919', '2920', '2924', '2925', '2927', '2928', '2929', '2931', '2940', '2943', '2944', '2945', '2946', '2948', '2952', '2953', '2956', '2957', '2960', '2961', '2963', '2966', '2967', '2969', '2974', '2977', '2981', '2985', '2986', '2987', '2988', '2993', '2994', '3000', '3004', '3010', '3014', '3022', '3023', '3025', '3026', '3029', '3030', '3033', '3036', '3040', '3042', '3044', '3047', '3052', '3054', '3056'], ['2726', '2769', '2789', '2807', '2851', '2888', '2899', '2978', '3045'], ['2767', '2834', '2879', '2889', '2959', '2972', '2982', '3008'], ['2926'], ['2732', '2902', '2930'], ['3031'], ['2736', '2771', '2836', '2841', '2858', '2893', '2898', '2954'], ['2792', '3037'], ['2776', '2799', '2861', '3053'], ['2805', '2886', '2922', '2933', '2935', '2949', '2983', '2997', '3012', '3028'], ['2774', '2817', '3055'], ['2824'], ['2860'], ['2842'], ['2808'], ['2857'], ['2788'], ['2900']]

plot_communities(sub_G, lp_communities)

_images/998aaf31b63b2c82a8da5331eaa86737ac44da87313f5995d546fb42fd03cf2e.png

5.3.5 Clique Percolation algorithm #

Clique Percolation algorithm is a method for detecting communities in a graph based on the concept of cliques. A clique is a subset of nodes in which every node is connected to every other node. The algorithm works as follows:

Identify all cliques of a specified size (k) in the graph.
Create a graph where each clique is a node.
Connect cliques that share at least one node.
Identify connected components in the new graph as communities.
Return the detected communities.

# We apply the Clique Percolation method to sub_G.
from networkx.algorithms.community import k_clique_communities
# Get the communities
k = 5 # Size of the cliques
k_communities = k_clique_communities(sub_G, k)
# Convert to sorted lists for readability
k_communities = [sorted(list(c)) for c in k_communities]
print("The number of clique percolation communities:", len(k_communities))
print("Communities:", k_communities)

The number of clique percolation communities: 6
Communities: [['2742', '2748', '2753', '2765', '2770', '2778', '2780', '2781', '2785', '2787', '2793', '2796', '2823', '2828', '2833', '2846', '2849', '2853', '2863', '2875', '2881', '2894', '2901', '2904', '2936', '2937', '2939', '2970', '2973', '2989', '2991', '2992', '3013', '3016', '3017', '3021', '3027', '3035', '3038', '3041', '3046', '3049', '3051'], ['2730', '2758', '2801', '2816', '2843', '2865', '2918', '2932', '2955', '3032'], ['2879', '2889', '2959', '2972', '2982', '3008'], ['2729', '2731', '2738', '2741', '2743', '2745', '2746', '2749', '2750', '2754', '2755', '2756', '2757', '2761', '2763', '2766', '2773', '2777', '2782', '2786', '2794', '2800', '2806', '2810', '2815', '2821', '2827', '2835', '2839', '2850', '2852', '2854', '2862', '2864', '2866', '2867', '2872', '2873', '2874', '2877', '2880', '2887', '2890', '2891', '2896', '2897', '2905', '2906', '2907', '2908', '2909', '2910', '2911', '2912', '2913', '2915', '2916', '2917', '2919', '2920', '2924', '2925', '2927', '2928', '2929', '2931', '2940', '2943', '2944', '2945', '2946', '2948', '2953', '2956', '2960', '2966', '2967', '2969', '2974', '2977', '2981', '2985', '2986', '2987', '2988', '2993', '2994', '3000', '3004', '3010', '3014', '3022', '3023', '3025', '3026', '3029', '3033', '3036', '3040', '3042', '3047', '3052', '3054', '3056'], ['2933', '2935', '2949', '2983', '2997', '3012', '3028'], ['2747', '2831', '2832', '2863', '2901', '3050']]

Exercise: Define a new function to draw the graph visualisation

5.3.6 Kernighan-Lin algorithm #

Kernighan-Lin algorithm is a method for detecting communities in a graph based on the concept of graph partitioning. The algorithm works as follows:

Initialize two partitions of the graph.
Compute the gain for each node in the partitions.
Swap nodes between the partitions to maximize the gain.
Repeat steps 2-3 until no further improvement can be made.
Return the final partitions as communities.

# We apply the Kernighan-Lin method to sub_G.
from networkx.algorithms.community import kernighan_lin_bisection
# Get the communities
kl_communities = kernighan_lin_bisection(sub_G)
# Convert to sorted lists for readability
kl_communities = [sorted(list(c)) for c in kl_communities]
print("The number of Kernighan-Lin communities:", len(kl_communities))
print("Communities:", kl_communities)

The number of Kernighan-Lin communities: 2
Communities: [['2726', '2729', '2731', '2738', '2741', '2743', '2745', '2746', '2749', '2750', '2754', '2755', '2756', '2757', '2759', '2761', '2763', '2766', '2769', '2773', '2774', '2776', '2777', '2782', '2786', '2788', '2789', '2792', '2794', '2798', '2800', '2805', '2806', '2807', '2808', '2809', '2810', '2815', '2817', '2820', '2821', '2824', '2827', '2835', '2837', '2839', '2842', '2850', '2851', '2852', '2854', '2857', '2860', '2862', '2864', '2866', '2867', '2872', '2873', '2874', '2877', '2880', '2884', '2886', '2887', '2888', '2890', '2891', '2896', '2897', '2899', '2900', '2905', '2906', '2907', '2908', '2909', '2910', '2911', '2912', '2913', '2915', '2916', '2917', '2919', '2920', '2922', '2924', '2925', '2926', '2927', '2928', '2929', '2931', '2933', '2935', '2940', '2943', '2944', '2945', '2946', '2948', '2949', '2952', '2953', '2956', '2957', '2960', '2961', '2963', '2966', '2967', '2969', '2974', '2977', '2978', '2981', '2983', '2985', '2986', '2987', '2988', '2993', '2994', '2997', '3000', '3004', '3010', '3012', '3014', '3022', '3023', '3025', '3026', '3028', '3029', '3030', '3031', '3033', '3036', '3037', '3040', '3042', '3044', '3045', '3047', '3052', '3054', '3055', '3056'], ['2723', '2727', '2728', '2730', '2732', '2733', '2735', '2736', '2737', '2739', '2742', '2744', '2747', '2748', '2751', '2753', '2758', '2762', '2765', '2767', '2768', '2770', '2771', '2772', '2778', '2779', '2780', '2781', '2783', '2784', '2785', '2787', '2790', '2791', '2793', '2795', '2796', '2797', '2799', '2801', '2802', '2803', '2804', '2811', '2812', '2816', '2818', '2819', '2823', '2825', '2828', '2829', '2830', '2831', '2832', '2833', '2834', '2836', '2840', '2841', '2843', '2844', '2845', '2846', '2847', '2848', '2849', '2853', '2855', '2856', '2858', '2859', '2861', '2863', '2865', '2868', '2870', '2871', '2875', '2876', '2878', '2879', '2881', '2882', '2889', '2893', '2894', '2895', '2898', '2901', '2902', '2904', '2914', '2918', '2921', '2923', '2930', '2932', '2934', '2936', '2937', '2939', '2942', '2947', '2950', '2951', '2954', '2955', '2958', '2959', '2965', '2970', '2972', '2973', '2975', '2980', '2982', '2984', '2989', '2990', '2991', '2992', '2995', '2996', '2998', '3002', '3006', '3007', '3008', '3009', '3013', '3015', '3016', '3017', '3018', '3021', '3024', '3027', '3032', '3035', '3038', '3039', '3041', '3043', '3046', '3048', '3049', '3050', '3051', '3053']]

# Plot the communities
plot_communities(sub_G, kl_communities)

_images/ffb0a8543213f194a247e1e96d0548228d74dda643a6bb838b8590804f5382f9.png

5.4 Link Prediction#

Link prediction aims to predict missing or future connections between nodes in a graph.
The idea behind link prediction is that nodes with similar properties or shared connections are more likely to form a link in the future.
It is widely used in social networks, biological networks, recommendation systems, and fraud detection.

Generate all non-edges: A non-edge is a pair of nodes that are not connected.

# Generate all non-edges (pairs of nodes without an edge between them)
non_edges = list(nx.non_edges(sub_G))
print(f"Number of non-edges: {len(non_edges)}")

Number of non-edges: 42166

5.4.1 Jaccard Coefficient#

The Jaccard Coefficient measures the similarity between two nodes based on their common neighbors. The Jaccard Coefficient \(J(u, v)\) between two nodes \(u\) and \(v\) is defined as:

\(J(u, v) = \frac{|N(u) \cap N(v)|}{|N(u) \cup N(v)|}\)

Where:

\( N(u) \) is the set of neighbors of node \( u \),
\( N(v) \) is the set of neighbors of node \( v \),
\( |N(u) \cap N(v)| \) is the size of the intersection of the neighbor sets of \( u \) and \( v \),
\( |N(u) \cup N(v)| \) is the size of the union of the neighbor sets of \( u \) and \( v \).
Example: J(A, D)

A - B - C
    |
E - D - F

\(N(A) = \set{B}\)
\(N(D) = \set{B, E, F}\)
\(|N(A) \cap N(D)|\) = 1
\(|N(A) \cup N(D)|\) = 3
\(J(A, D) = 1/3\)

Calculating JC using nx

# Create G_example
G_example = nx.Graph()
edges = [('A', 'B'), ('B', 'C'), ('B', 'D'), ('D', 'E'), ('D', 'F')]
G_example.add_edges_from(edges)
# Calculate Jaccard coefficient between node 'A' and node 'D'
jaccard_coefficient = list(nx.jaccard_coefficient(G_example, [('A', 'D')]))
print(jaccard_coefficient)

[('A', 'D', 0.3333333333333333)]

Link prediction on social network

# Jaccard Coefficient for non-edges
jaccard_scores = list(nx.jaccard_coefficient(sub_G, non_edges))

# Sort the Jaccard scores in descending order (high scores indicate higher likelihood of a link)
top_predictions = sorted(jaccard_scores, key=lambda x: x[2], reverse=True)[:15] # we can also use pandas to sort the values to a df.
# Display the top 15 predicted links
print("Top 15 Predictions (Jaccard Coefficient):")
for u, v, score in top_predictions:
    print(f"Predicted link: ({u}, {v}) with JC: {score:.2f}")

Top 15 Predictions (Jaccard Coefficient):
Predicted link: (2855, 2811) with JC: 1.00
Predicted link: (2736, 2893) with JC: 1.00
Predicted link: (3006, 2980) with JC: 1.00
Predicted link: (2982, 2972) with JC: 0.80
Predicted link: (2816, 2801) with JC: 0.60
Predicted link: (2851, 3045) with JC: 0.60
Predicted link: (2777, 2880) with JC: 0.59
Predicted link: (2929, 2977) with JC: 0.58
Predicted link: (2816, 3032) with JC: 0.57
Predicted link: (2919, 2750) with JC: 0.57
Predicted link: (2777, 2750) with JC: 0.57
Predicted link: (2755, 2919) with JC: 0.56
Predicted link: (2997, 3028) with JC: 0.56
Predicted link: (2927, 2877) with JC: 0.55
Predicted link: (2920, 2800) with JC: 0.55

Visualise the top 15 predicted links by Jaccard Coefficient, you can use other solution to plot all edges by Jaccard Coefficient.

plt.figure(figsize=(10, 8))
pos = nx.spring_layout(sub_G, k=0.5)
nx.draw_networkx_nodes(sub_G, pos, node_size=8, node_color='skyblue', alpha=0.6)
nx.draw_networkx_edges(sub_G, pos, edge_color='gray', alpha=0.5, width=0.1)
nx.draw_networkx_labels(sub_G, pos, font_size=3, font_color="black")
# Highlight the top 15 edges with the highest Jaccard scores
highlighted_edges = [(u, v) for u, v, score in top_predictions]
# Draw the highlighted edges with a different color (e.g., red) and thickness
nx.draw_networkx_edges(sub_G, pos, edgelist=highlighted_edges, edge_color='r', width=0.8, style='dashed')
plt.title("Top 15 Predicted links by Jaccard Coefficient")
plt.axis("off")
plt.show()

_images/8721f11a36c0f77525ecc001a90212396ff8f4eed2b745cbcb45cfd286220494.png

5.4.2 Common neighbours#

In link prediction, common neighbors is a simple and effective method used to predict the likelihood of an edge (link) forming between two nodes in a network. It is based on the idea that two nodes are more likely to be connected if they share many common neighbors.

# Step 1: Calculate common neighbors and assign scores (e.g., the frequency of common neighbours)
predictions = []
for node_a in sub_G.nodes():
    for node_b in sub_G.nodes():
        if node_a != node_b and not sub_G.has_edge(node_a, node_b):
            # Calculate common neighbors between node_a and node_b
            common_neighbors_list = list(nx.common_neighbors(sub_G, node_a, node_b))
            common_neighbors_score = len(common_neighbors_list)

            if common_neighbors_score > 0:  # Only consider pairs with common neighbors
                predictions.append((node_a, node_b, common_neighbors_score))
# Step 2: Sort predictions based on the common neighbors score in descending order
predictions.sort(key=lambda x: x[2], reverse=True)
# Step 3: Output the top predictions (e.g., top 15 most likely links)
top_predictions = predictions[:15]
print("Top 10 link predictions based on common neighbors:")
for pred in top_predictions:
    print(f"Prediction: ({pred[0]}, {pred[1]}) with score: {pred[2]}")

Top 10 link predictions based on common neighbors:
Prediction: (2927, 2877) with score: 42
Prediction: (2877, 2927) with score: 42
Prediction: (2944, 2986) with score: 41
Prediction: (2986, 2944) with score: 41
Prediction: (2925, 2754) with score: 38
Prediction: (2754, 2925) with score: 38
Prediction: (2928, 2877) with score: 37
Prediction: (2777, 2877) with score: 37
Prediction: (2777, 2750) with score: 37
Prediction: (2877, 2928) with score: 37
Prediction: (2877, 2777) with score: 37
Prediction: (2750, 2777) with score: 37
Prediction: (2960, 3054) with score: 36
Prediction: (2919, 2750) with score: 36
Prediction: (3054, 2960) with score: 36

plt.figure(figsize=(10, 8))
pos = nx.spring_layout(sub_G, k=0.5)
nx.draw_networkx_nodes(sub_G, pos, node_size=8, node_color='skyblue', alpha=0.6)
nx.draw_networkx_edges(sub_G, pos, edge_color='gray', alpha=0.5, width=0.1)
nx.draw_networkx_labels(sub_G, pos, font_size=3, font_color="black")
# Highlight the top 15 edges with the highest Jaccard scores
highlighted_edges = [(u, v) for u, v, score in top_predictions]
# Draw the highlighted edges with a different color (e.g., red) and thickness
nx.draw_networkx_edges(sub_G, pos, edgelist=highlighted_edges, edge_color='r', width=0.8, style='dashed')
plt.title("Top 15 Predicted links by Common Neighbors")
plt.axis("off")
plt.show()

_images/fda2493466c84fc071e9f1db0239ecbe9a54f4499a0db22fce33618d04dd193b.png

5.5 Influence propagation#

Influence Propagation is a model used to simulate how information, behaviors, or influence spread through a network over time. It focuses on diffusion processes in networks and is often used in the context of social influence, viral marketing, or epidemic spread.

5.5.1 Independent Cascade Model (IC Model)#

NDlib is a Python library designed to describe, simulate and study diffusion processes on complex networks.

# ! pip install ndlib

import ndlib.models.ModelConfig as mc
import ndlib.models.epidemics as ep #The IC model is included in epidemics in NDLib as a class and is a widely used model for simulating the propagation or spread of viruses in epidemics.

# Select Independent Cascade Model on Sub_G
ic_model = ep.IndependentCascadesModel(sub_G)

# Set the model configuration
config = mc.Configuration() # Create configuration container
config.add_model_parameter('fraction_infected', 0.01)  # the fraction of initially infected nodes

# Set the edge thresholds (if needed, otherwise this may be omitted in IC model)
threshold = 0.1 # Default propagation probability for all edges
for e in sub_G.edges():
    config.add_edge_configuration("threshold", e, threshold)

# Apply configuration to model
ic_model.set_initial_status(config)

# Run the simulation for specified number of iterations
iterations = ic_model.iteration_bunch(100)

# Calculate the fraction of infected nodes over time
time_steps = list(range(100))
fraction_infected = [it['node_count'][1] / sub_G.number_of_nodes() for it in iterations]

# Infection Curve Visualization
plt.figure(figsize=(10, 5))
plt.plot(time_steps, fraction_infected, marker='o', linestyle='-', color='orange', markersize=2.5, linewidth=1)
plt.title("Infection Spread Over Timestep/iteration (IC Model)")
plt.xlabel("Timestep/iteration")
plt.ylabel("Fraction of Infected Nodes")
plt.show()

_images/daf58d030a47b817c73276b8acdeda7ee2502dab0f95ce5cd5ccfe633f6a36d4.png

# Plot setup
plt.figure(figsize=(6, 4))
pos = nx.spring_layout(sub_G, k=0.2)  # Preserve layout across iterations
cumulative_infected = set()

# Limit to first 10 iterations
for i, iteration in enumerate(iterations[:10]):
    plt.clf()  # Reset canvas

    # Update infections
    current_status = iteration['status']
    new_infected = [n for n, s in current_status.items() if s == 1]
    cumulative_infected.update(new_infected)

    # Create node colors
    node_colors = ['orange' if n in cumulative_infected else 'skyblue' for n in sub_G.nodes()]

    # Plot graph
    nx.draw(sub_G, pos, node_color=node_colors, with_labels=False,
            node_size=50, edge_color='gray', width=0.1, alpha=0.6)

    # Add annotation
    plt.title(f"Iteration: {i+1} | "
             f"New infected: {len(new_infected)} | Total infected: {len(cumulative_infected)}\n"
             f"({len(cumulative_infected)/sub_G.number_of_nodes():.1%} infected)", fontsize=8)

    plt.axis('off')
    plt.pause(1)  # Second pause between iterations

plt.show()

_images/27068b725f6fc77e1221c5daa4e757094f77ce2629104f80ec3fbd487b8c5b57.png

_images/cff31052ac26c8ef857ab4db4ec66183065cc6d44ca83ae1ea042b83e5aab0d5.png

_images/f94d577fda9861f0b42eb17fa3ae1a5ef3adec0492a2034fe8c3a954a1033edc.png

_images/6a3ec5f29c6ed7f12f20faa18454c927331374cb044023c9c1c0b8294ec9b206.png

_images/cfdf0faf10e6d26bb767e4d0d63be29eac8cbeff5062e8a0181eb4c43b166f2c.png

_images/ead95db09b1d2727b85daeb15d2e0a2c5cc62814b64b2d8a1d281e6512b7aa73.png

_images/43d5ec7d2455a1de987d168241c170ba7af09477e8e65a5d72e23b47c7042127.png

_images/a935f23fb0ebd0df5be3cb869356377a3ea4f6321a91e2201252ed05558ee176.png

_images/36e31eaaaf851076ca79e8e93d4b5ce80d2b3b78ae46d6b8654803336a3ba073.png

_images/fe0922d765d9e6712b7957a68b50b0c1db7a6236662820875664687aa4021cb6.png

5.5.2 Susceptible-Infected-Removed (SIR) model #

The SIR model used in epidemiology to describe how infectious diseases spread through a population.

It divides the population into three parts:

Susceptible (S): Individuals who are not yet infected but can contract the disease.
Infected (I): Individuals who are currently infected and can spread the disease to susceptible individuals.
Removed (R): Individuals who have either recovered (and gained immunity) or died, meaning they no longer participate in disease transmission.

Key Parameters:

\(\beta\) (beta): Infection probability
\(\gamma\) (gamma): Recovery probability

# Set SIR Model
sir_model = ep.SIRModel(sub_G)

# 2. Configure parameters
config = mc.Configuration()
config.add_model_parameter('beta', 0.4)  # Infection probability
config.add_model_parameter('gamma', 0.2)  # Recovery probability

# Option 1: Initial infected nodes (fraction)
config.add_model_parameter("fraction_infected", 0.01)

# Option 2: Manually set initial infected nodes
# initial_infected = [1, 5, 10]  # Specify initial infected nodes with index
# config.add_model_initial_configuration("Infected", initial_infected)

# Model configuration set
sir_model.set_initial_status(config)

# Run simulation
iterations = sir_model.iteration_bunch(50)

# Process results
susceptible = []
infected = []
removed = []

for iteration in iterations:
    susceptible.append(iteration['node_count'][0]/sub_G.number_of_nodes())
    infected.append(iteration['node_count'][1]/sub_G.number_of_nodes())
    removed.append(iteration['node_count'][2]/sub_G.number_of_nodes())

# Plot curve of SIR
plt.figure(figsize=(8, 5))
plt.plot(susceptible, label='Susceptible', color='skyblue')
plt.plot(infected, label='Infected', color='orange')
plt.plot(removed, label='Removed', color='green')
plt.title("SIR dynamics")
plt.xlabel("Timestep/iteration")
plt.ylabel("Fraction of Population")
plt.legend()
plt.show()

_images/3b408fb40db218e943c766d7c3d34b0ddfdb87f32688cfc3e9242a457b34434b.png

pos = nx.spring_layout(sub_G, k=0.2)  # Calculate once for consistent visualization
# Create figure
plt.figure(figsize=(6, 4))

for i, iteration in enumerate(iterations[:10]):
    plt.clf()  # Clear previous frame

    # Get current node states
    statuses = iteration['status']
    infected = [n for n, s in statuses.items() if s == 1]
    removed = [n for n, s in statuses.items() if s == 2]

    # Create cmap
    node_colors = []
    for node in sub_G.nodes():
        if node in infected:
            node_colors.append('orange')    # Currently infected
        elif node in removed:
            node_colors.append('green')  # Removed/recovered
        else:
            node_colors.append('skyblue')  # Susceptible

    # Plot graph
    nx.draw(sub_G, pos, node_color=node_colors, with_labels=False,
            node_size=50, edge_color='grey', width=0.1, alpha=0.5)

    from matplotlib.lines import Line2D
    legend_elements = [
            Line2D([0], [0], marker='o', color='w', label=f'Susceptible: {len(susceptible)}',
               markerfacecolor='skyblue', markersize=8, linestyle='None'),
        Line2D([0], [0], marker='o', color='w', label=f'Infected: {len(infected)}',
               markerfacecolor='orange', markersize=8, linestyle='None'),
        Line2D([0], [0], marker='o', color='w', label=f'Removed: {len(removed)}',
               markerfacecolor='green', markersize=8, linestyle='None')
    ]

    plt.legend(handles=legend_elements, loc='upper right',
           title=f"Iteration {i+1}", frameon=False)

    plt.title(f"SIR Model - Timestep/iteration {i+1}")
    plt.axis('off')
    plt.pause(1)  # 1 second pause between frames

plt.show()