import collections
import random
import statistics

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

# set out the plots from matplotlib display
%matplotlib notebook


# directory that holds the per-village adjacency matrix CSV files
path = "./datav4.0/Data/1. Network Data/Adjacency Matrices/"
# village indices 1 through 77 inclusive, except 13 and 22, which are not in the list
villages = [index for index in range(1, 78) if index not in (13, 22)]

# maps village index -> networkx graph object of that village
Gs = {}

# read every village's adjacency matrix and turn it into a networkx graph
for k in villages:
    # build the file name for this village, e.g. for the first village:
    # "./datav4.0/Data/1. Network Data/Adjacency Matrices/adj_allVillageRelationships_vilno_1.csv"
    filename = f"{path}adj_allVillageRelationships_vilno_{k}.csv"
    # the file is a comma-delimited numeric matrix; load it as A
    A = np.loadtxt(filename, delimiter=",")
    # convert the matrix to a networkx graph and store it under its village index
    G = nx.to_networkx_graph(A)
    Gs[k] = G


# Question 2: extract each village's largest connected component (LCC) and
# plot the share of the village's nodes that the LCC contains.
# NOTE: villages 13 and 22 are missing from the village list, so they do not
# appear here.

# Set the x tick label size before the figure is created so that it applies
# to this figure's axes (one tick label is drawn per village).
plt.rc('xtick', labelsize=3)
plt.figure(figsize=(10,5))

# village index -> subgraph induced by the village's largest connected component
LCCs = {}
for i in Gs:
    # max() returns the first component of maximal size, the same component
    # that a stable descending sort by size would put first
    largest = max(nx.connected_components(Gs[i]), key=len)
    LCCs[i] = nx.induced_subgraph(Gs[i], largest)

# fraction of each village's nodes that lie in its LCC, in the order of Gs
prop = [LCCs[i].number_of_nodes() / Gs[i].number_of_nodes() for i in Gs]

# village label (as a string) -> proportion, ordered by increasing proportion;
# the sort is stable, so ties keep their original village order
d = dict(sorted(zip([str(i) for i in Gs], prop), key=lambda item: item[1]))

plt.scatter(d.keys(), d.values())
plt.xlabel("Index of Village")
plt.ylabel("Proportion of nodes in LCC")
plt.title("Proportion of nodes in LCC vs. Index of Village")


# Question 3: draw every village's LCC on its own panel of a grid.
ncols = 6
# enough rows for one panel per village (ceiling division; 13 rows for the
# 75 villages in the village list)
nrows = max(1, -(-len(LCCs) // ncols))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(20,40))
ax = axes.flatten()

# hide the frame of every panel, including leftover panels that get no network
for panel in ax:
    panel.set_axis_off()

# one network per panel, in the order of LCCs
for panel, lcc in zip(ax, LCCs.values()):
    nx.draw(lcc, ax=panel, node_size=5)


# Question 3 (continued): overlay the degree distribution of every LCC.
# Set the x tick label size once, before the figure is created, so that it
# applies to this figure's axes.
plt.rc('xtick', labelsize=15)
plt.figure()
for i in LCCs:
    # degree_histogram()[k] is the number of nodes with degree k; drop the
    # degree-0 entry since we are dealing with the LCC
    freq = nx.degree_histogram(LCCs[i])[1:]
    # x values are the degrees 1, 2, ..., matching the remaining entries
    plt.plot(range(1, len(freq) + 1), freq)

plt.xlabel("Degree")
plt.ylabel("Degree frequency")
plt.title("Degree distribution of each LCC")

# Cell output: Text(0.5, 1.0, 'Degree distribution of each LCC')


# util functions
def recover(i_nodes, r_nodes, p):
    """Let each infected node recover with probability ``p``.

    One random draw is made per entry of ``i_nodes``, in order. Nodes that
    recover are removed from ``i_nodes`` and appended to ``r_nodes``; both
    lists are modified in place.

    Returns the list of nodes that recovered during this call.
    """
    # decide every recovery first so that i_nodes is not mutated while iterating
    new_recoveries = [node for node in i_nodes if random.random() < p]
    for node in new_recoveries:
        i_nodes.remove(node)
    r_nodes.extend(new_recoveries)
    return new_recoveries

def spread(G, s_nodes, i_nodes, p):
    """Run one transmission step and return the newly infected nodes.

    Every infected node that has at least one neighbor picks one neighbor
    uniformly at random. Each distinct picked node that is currently
    susceptible becomes infected with probability ``p``.

    ``s_nodes`` and ``i_nodes`` are modified in place: new infections are
    appended to ``i_nodes`` and removed from ``s_nodes``.
    """
    # distinct contacts picked by the infected nodes (isolated nodes pick nobody)
    sampled = {
        random.choice(list(G.neighbors(infected)))
        for infected in i_nodes
        if G.degree(infected) > 0
    }

    new_infections = []
    for candidate in sampled:
        if candidate not in s_nodes:
            continue
        if random.random() < p:
            i_nodes.append(candidate)
            new_infections.append(candidate)

    # newly infected nodes are no longer susceptible
    for node in new_infections:
        s_nodes.remove(node)

    return new_infections

def simulate(G, p_si, p_ir, num_seeds, num_time_steps):
    """Simulate an SIR process on ``G`` and return the compartment sizes over time.

    ``num_seeds`` nodes are sampled at random as the initial infected set; all
    other nodes start susceptible. Each time step first applies recovery
    (probability ``p_ir``) and then transmission (probability ``p_si``).

    Returns a tuple of three lists (S counts, I counts, R counts), each holding
    the initial count followed by one count per simulated time step.
    """
    all_nodes = list(G.nodes())
    i_nodes = random.sample(all_nodes, num_seeds)
    s_nodes = [node for node in all_nodes if node not in i_nodes]
    r_nodes = []

    # one running series of sizes per compartment, starting with the initial state
    compartments = (s_nodes, i_nodes, r_nodes)
    history = [[len(members)] for members in compartments]

    steps_left = num_time_steps
    while steps_left > 0:
        # recovery happens before transmission within a step
        recover(i_nodes, r_nodes, p_ir)
        spread(G, s_nodes, i_nodes, p_si)

        for series, members in zip(history, compartments):
            series.append(len(members))

        steps_left -= 1

    num_s_nodes, num_i_nodes, num_r_nodes = history
    return num_s_nodes, num_i_nodes, num_r_nodes

def make_plot(num_nodes, num_s_nodes, num_i_nodes, num_r_nodes, num_time_steps): 
    """Plot the S, I and R counts as fractions of ``num_nodes`` with pyplot.

    Draws one line per compartment, labels the axes, adds a legend and limits
    the x axis to ``[0, num_time_steps]``.
    """
    handles = []
    for counts in (num_s_nodes, num_i_nodes, num_r_nodes):
        # plt.plot returns a list with one line; keep it for the legend
        line, = plt.plot(np.array(counts) / num_nodes)
        handles.append(line)
    plt.xlabel("Time")
    plt.ylabel("Fraction of S, I, and R nodes")
    plt.legend(handles, ["S nodes","I nodes","R nodes"], loc="center left") 
    plt.xlim([0, num_time_steps])


p_si = 0.2 # S to I transition probability
p_ir = 0.1 # I to R transition probability

# Question 4: run repeated SIR simulations on the LCC of four villages and
# overlay the S, I and R fractions of every run, one panel per village.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize = (10, 7))
ax = axes.flatten()

four_villages = [1, 31, 61, 77]
num_time_steps = 250
num_simulations = 300

# village -> line handles (S, I, R) of its most recent run, used for the legend
legend_handles = {}

for _ in range(num_simulations):
    for panel, village in zip(ax, four_villages):
        lcc = LCCs[village]
        num_nodes = lcc.number_of_nodes()
        num_s_nodes, num_i_nodes, num_r_nodes = simulate(
            lcc, p_si, p_ir, num_seeds=1, num_time_steps=num_time_steps
        )
        h1, = panel.plot(np.array(num_s_nodes) / num_nodes, color="blue")
        h2, = panel.plot(np.array(num_i_nodes) / num_nodes, color="red")
        h3, = panel.plot(np.array(num_r_nodes) / num_nodes, color="green")
        legend_handles[village] = [h1, h2, h3]

# Labels, title, legend and limits are the same for every run, so set them
# once per panel instead of once per simulation.
for panel, village in zip(ax, four_villages):
    panel.set_xlabel("Time")
    panel.set_ylabel("Fraction of S, I, and R nodes")
    panel.set_title(f"Village {village}")
    panel.legend(legend_handles[village], ["S nodes","I nodes","R nodes"], loc="center left")
    panel.set_xlim([0, num_time_steps])

village_list = ", ".join(str(village) for village in four_villages)
fig.suptitle(f"{num_simulations} SIR Simulations for Villages {village_list}")

# Cell output: Text(0.5, 0.98, '300 SIR Simulations for Villages 1, 31, 61, 77')


# ADD YOUR CODE HERE
villages = [1, 31]
# Rt at a time step: the number of new infections in that step divided by the number of nodes infected at the start of that step

# new simulate function that returns Rt
def simulate_Rt(G, p_si, p_ir, num_seeds, num_time_steps):
    """Simulate an SIR process on ``G`` and return the per-step ratio Rt.

    ``num_seeds`` nodes are sampled at random as the initial infected set; all
    other nodes start susceptible. Each time step applies recovery
    (probability ``p_ir``) and then transmission (probability ``p_si``).

    For every step, Rt is the number of new infections produced in that step
    divided by the number of infected nodes at the start of the step (before
    recovery), or 0 when no node was infected at the start of the step.

    Returns a list with one Rt value per time step (``num_time_steps`` is
    expected to be an integer).
    """
    i_nodes = random.sample(list(G.nodes()), num_seeds)
    s_nodes = [node for node in list(G.nodes()) if node not in i_nodes]
    r_nodes = []

    Rt = []
    for _ in range(num_time_steps):
        # size of the infected set before this step's recoveries and infections
        infected_at_start = len(i_nodes)

        recover(i_nodes, r_nodes, p_ir)
        new_infections = spread(G, s_nodes, i_nodes, p_si)

        if infected_at_start == 0:
            # nobody could transmit; avoid dividing by zero
            Rt.append(0)
        else:
            Rt.append(len(new_infections) / infected_at_start)

    return Rt


# Question 5: plot Rt over time for villages 1 and 31 (1000 runs each, with
# the mean across runs drawn on top) and compare the two means in a third panel.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize = (10,7))
ax = axes.flatten()

mean_village_1 = []
mean_village_31 = []

# (village, list collecting its runs, style of the mean line, y label or None)
panels = (
    (1, mean_village_1, {}, "Effective Reproductive Number"),
    (31, mean_village_31, {"color": "red"}, None),
)

# village -> mean Rt across runs at every time step
mean_rt = {}

for panel_index, (village, runs, mean_style, y_label) in enumerate(panels):
    plt.sca(ax[panel_index])
    for _ in range(1000):
        rt_series = simulate_Rt(LCCs[village], p_si, p_ir, num_seeds = 1, num_time_steps = num_time_steps)
        runs.append(rt_series)
        plt.plot(rt_series, color = "lightblue")
    mean_rt[village] = np.mean(np.array(runs), axis=0)
    plt.plot(mean_rt[village], **mean_style)
    plt.xlabel(f"Effective Reproductive \nNumber for Village {village} Over Time.\n Line is mean Rt.")
    if y_label is not None:
        plt.ylabel(y_label)

# third panel: the two mean curves side by side
plt.sca(ax[2])
plt.plot(mean_rt[1])
plt.plot(mean_rt[31], color = "red")
plt.xlabel("Mean Effective Reproductive \nNumber for Village 1 (blue)\n and Village 31 (red) Over Time")

# Cell output: Text(0.5, 0, 'Mean Effective Reproductive \nNumber for Village 1 (blue)\n and Village 31 (red) Over Time')


# new spread that takes each node's treatment vs. control assignment into account
def spread_Q6(G, s_nodes, i_nodes, p, group):
    """Run one transmission step with a per-node infection probability.

    Every infected node that has at least one neighbor picks one neighbor
    uniformly at random. Each distinct picked node that is currently
    susceptible becomes infected with the probability stored for it in
    ``group``; a node that has no entry in ``group`` falls back to ``p``.

    ``s_nodes`` and ``i_nodes`` are modified in place: new infections are
    appended to ``i_nodes`` and removed from ``s_nodes``.

    Returns the list of nodes infected during this call.
    """
    new_infections = []
    sampled = set()
    for i in i_nodes:
        # isolated nodes have nobody to contact
        if G.degree(i) > 0:
            sampled.add(random.choice(list(G.neighbors(i))))

    for i in sampled:
        # use the node's own probability, or p when the node is unassigned
        if i in s_nodes and random.random() < group.get(i, p):
            i_nodes.append(i)
            new_infections.append(i)

    # remove all new infected nodes from susceptible nodes
    for i in new_infections:
        s_nodes.remove(i)

    return new_infections

def simulate_Q6(G, p_si, p_ir, num_seeds, num_time_steps, group):
    """Simulate an SIR process with per-node infection probabilities.

    ``num_seeds`` nodes are sampled at random as the initial infected set; all
    other nodes start susceptible. Each time step applies recovery
    (probability ``p_ir``) and then transmission via ``spread_Q6``, which
    receives both ``p_si`` and the ``group`` mapping of per-node probabilities.

    Returns the set of all nodes that were ever infected, including the seeds
    (``num_time_steps`` is expected to be an integer).
    """
    i_nodes = random.sample(list(G.nodes()), num_seeds)
    s_nodes = [node for node in list(G.nodes()) if node not in i_nodes]
    r_nodes = []

    # the seeds count as cases too
    total_infected = set(i_nodes)

    for _ in range(num_time_steps):
        recover(i_nodes, r_nodes, p_ir)

        new_infected = spread_Q6(G, s_nodes, i_nodes, p_si, group)
        total_infected.update(new_infected)

    return total_infected


# fix some parameters
p_ir = 0.05
p_si_treatment = 0.05
p_si_control = 0.3

# Question 6: individual-level randomized trial. In every village each node is
# assigned to treatment or control, one SIR simulation is run, and the
# proportion of each arm that was ever infected is recorded.
total_villages = []
for i in LCCs:
    lcc = LCCs[i]
    # node -> its infection probability; random.choices draws each arm with
    # equal probability, independently per node
    group = dict(zip(list(lcc.nodes()), random.choices([p_si_treatment, p_si_control], k=len(lcc))))
    all_infected = simulate_Q6(lcc, p_si, p_ir, num_seeds = 1, num_time_steps = num_time_steps, group = group)

    # number of ever-infected nodes per arm (a Counter yields 0 for an arm with no cases)
    cases_by_arm = collections.Counter(group[node] for node in all_infected)
    # number of nodes assigned to each arm
    size_by_arm = collections.Counter(group.values())

    # an arm that happens to have no members contributes a proportion of 0
    if size_by_arm[p_si_treatment]:
        treatment_prop = cases_by_arm[p_si_treatment] / size_by_arm[p_si_treatment]
    else:
        treatment_prop = 0
    if size_by_arm[p_si_control]:
        control_prop = cases_by_arm[p_si_control] / size_by_arm[p_si_control]
    else:
        control_prop = 0

    total_villages.append([treatment_prop, control_prop, control_prop - treatment_prop])

# column-wise mean over villages: treatment, control, difference
vals = np.mean(np.array(total_villages), axis=0)
print("Average proportion of cases in the treatment groups:", vals[0])
print("Average proportion of cases in the control groups:", vals[1])
print("Average difference in proportion of cases in the control vs treatment groups:", vals[2])

# Cell output:
# Average proportion of cases in the treatment groups: 0.0930454572422352
# Average proportion of cases in the control groups: 0.22984194582342846
# Average difference in proportion of cases in the control vs treatment groups: 0.13679648858119328


def simulate_Q7(G, p_si, p_ir, num_seeds, num_time_steps):
    """Simulate an SIR process on ``G`` and return every node that was infected.

    ``num_seeds`` nodes are sampled at random as the initial infected set; all
    other nodes start susceptible. Each time step applies recovery
    (probability ``p_ir``) and then transmission (probability ``p_si``).

    Returns the set of all nodes that were ever infected, including the seeds
    (``num_time_steps`` is expected to be an integer).
    """
    i_nodes = random.sample(list(G.nodes()), num_seeds)
    s_nodes = [node for node in list(G.nodes()) if node not in i_nodes]
    r_nodes = []

    # the seeds count as cases too
    total_infected = set(i_nodes)

    for _ in range(num_time_steps):
        recover(i_nodes, r_nodes, p_ir)

        new_infected = spread(G, s_nodes, i_nodes, p_si)
        total_infected.update(new_infected)

    return total_infected


# fix some parameters
p_ir = 0.05
p_si_treatment = 0.05
p_si_control = 0.3

# Question 7: cluster-level randomized trial. Every village as a whole is
# assigned to treatment or control, one SIR simulation is run with that arm's
# infection probability, and the proportion of ever-infected nodes is recorded.
treatment_villages = []
control_villages = []

for i in LCCs:
    # pick this village's arm at random
    curr_p_si = random.choices([p_si_treatment,p_si_control])[0]
    all_infected = simulate_Q7(
        LCCs[i], curr_p_si, p_ir, num_seeds = 1, num_time_steps = num_time_steps
    )

    # share of the village's LCC that was ever infected
    case_proportion = len(all_infected)/LCCs[i].number_of_nodes()
    arm_results = treatment_villages if curr_p_si == p_si_treatment else control_villages
    arm_results.append(case_proportion)

treatment_mean = np.mean(treatment_villages)
control_mean = np.mean(control_villages)
print("Average proportion of cases in the treatment villages:", treatment_mean)
print("Average proportion of cases in the control villages:", control_mean)
print("Average difference in proportion of cases in the control vs treatment villages:", control_mean- treatment_mean)

# Cell output:
# Average proportion of cases in the treatment villages: 0.003566259163982035
# Average proportion of cases in the control villages: 0.5420319868506677
# Average difference in proportion of cases in the control vs treatment villages: 0.5384657276866857

Introduction to Social and Biological Networks Final Project

Final Project¶

Overview¶

Question 1: Downloading and reading in village data¶

Question 2: Extracting largest connected components¶

Question 3: Visualizing networks and their degree and distributions¶

Question 4: Simulating SIR process¶

Question 5: Reproduction number¶

Question 6: Individual-level randomized controlled vaccine trial¶

Question 7: Cluster-level randomized controlled vaccine trial¶

Introduction to Social and Biological Networks Final Project

BST 267: Introduction to Social and Biological Networks (2022)¶

Final Project¶

Overview¶

Question 1: Downloading and reading in village data¶

Question 2: Extracting largest connected components¶

Question 3: Visualizing networks and their degree and distributions¶

Question 4: Simulating SIR process¶

Question 5: Reproduction number¶

Question 6: Individual-level randomized controlled vaccine trial¶

Question 7: Cluster-level randomized controlled vaccine trial¶