Diffusion Simulations

Download R source file
            
              # Diffusion simulation models in R
# Author: Jake Fisher

# This lab will focus on how to simulate a diffusion process in R.  We will
# cover:
# - Setting up a diffusion simulation
# - Changing different parameter settings
# - Presenting the results

# Following best practices, we begin by setting up the workspace
rm(list = ls())
setwd("C:/Users/fishe/Box Sync/Home Folder jcf26/SNH2017/Instructor dropbox")
library(statnet)
library(tidyverse)
library(magrittr)
library(ggnetwork)

# Dedicated diffusion packages
library(EpiModel)
library(netdiffuseR)

# Typically, the purpose of a diffusion simulation is to illustrate the effects
# of a new diffusion process, or a new type of network structure, on the
# ultimate results of a diffusion process.  As such diffusion simulations are 
# highly variable -- after all, the point for each one is to show something new!

# However, despite the variability, most diffusion simulations build on a
# common set of building blocks:
# - an underlying network structure,
# - a transmission process, and
# - a running timer.
# In essence, we are defining an algorithm, or a set of rules, for how people
# get "infected" with a social contagion, and looking at how those rules perform
# under differnt conditions.

# To illustrate this, I will show a few examples of diffusion models.  These are
# just an introduction to what you can do; the sky is the limit!

##### Example 1: Disease epidemics ####
# The most straightforward and common use for a diffusion simulation is to
# simulate an epidemic spreading through a population.  Some examples of this
# include:
# Moody, James. 2002. "The Importance of Relationship Timing for Diffusion." 
#      Social Forces 81(1):25-56.
# Merli, M.Giovanna, James Moody, Joshua Mendelsohn, and Robin Gauthier. 2015.
#      "Sexual Mixing in Shanghai: Are Heterosexual Contact Patterns Compatible 
#      With an HIV/AIDS Epidemic?" Demography 52(3):919-42.

# In the example here, we will simulate an epidemic spreading through a random
# network.  This roughly approximates a "random mixing" scenario, where people
# interact randomly.

# To simulate random mixing, we construct the following rules:
# 1. Set up a world with a fixed number of people and a small fraction of 
#    initial infections, arranged randomly as a network
# 2. Pick an edge at random
# 3. If the edge is discordant (a susceptible-infected connection), flip a coin
#    to determine whether the non-infected person gets infected
# 4. Repeat steps 1 and 2 until everyone is infected, or until a certain number
#    of steps has passed

# First, in our simple example, let's set our parameters ahead of time
n.people <- 100
p.infection <- 0.5
pct.starting.infected <- 0.01
max.time <- 5000
contact.rate <- 0.05 / 2  # prob. of contact between any 2 people in the network

### Step 1: Set up world ###
# Create a random graph, where edges are placed randomly.  This is called a
# Bernoulli or an Erdos-Renyi random graph
set.seed(919)
net <- rgraph(n = n.people, tprob = contact.rate) %>%
  symmetrize %>%  # Make symmetric -- doesn't matter who is connected to who
  network(matrix.type = "adjacency")  # convert to statnet

# Chose a subset of people who are infected initially
infected <- sample(
  x = c(T, F),      # people can be infected (T) or susceptible (F)
  size = n.people,  # create a vector that is n.people long
  replace = T,      # with replacement, so that >1 person can be infected
  prob = c(pct.starting.infected, 1 - pct.starting.infected)
  )

### Step 2: Choose an edge ###
# For each step, we're going to choose an edge at random, and then, if the edge
# is discordant, flip a coin to determine whether the susceptible person gets
# infected.

# First, create an edgelist...
el <- as.edgelist(net) %>%
  as.data.frame %>%
  set_names(value = c("from", "to")) %>%
  
  # ... attach the values of infected...
  mutate(from.infected = infected[from], to.infected = infected[to],
         
         # ... and create a discordant variable
         discordant = (from.infected != to.infected))

# Next, choose an edge at random
random.edge <- sample(nrow(el), size = 1)

# Check if the edge is discordant
el[random.edge, "discordant"]  # it's not, so we do nothing.

# For the example, I'm going to speed this up by choosing a discordant edge
discordant.edge <- sample(which(el$discordant), size = 1)

### Step 3: Flip a coin to see if the person gets infected ###

# Now, flip a coin to see if the uninfected person gets infected
el[discordant.edge, ]  # Person 62 is the suceptible, but we will want to be
                       # able to determine that without looking manually

# A little tricky indexing to pull out the ID of the susceptible person...
who.susceptible <- with(
  el[discordant.edge, ],
  c(from, to)[!c(from.infected, to.infected)]
  )

# Flip the coin to determine if infection spreads (it does)
(infected[who.susceptible] <- sample(c(T, F), size = 1, 
                                    prob = c(p.infection, 1 - p.infection)))

### Step 4: Repeat ###
# To repeat this process, we actually embed steps 1 and 2 in a loop.

# Set up a list with the output
infections <- vector(length = max.time, mode = "list")

# Save what we already did as the first iteration
infections[[1]] <- infected

# Quick aside -- what did we create?
head(infections)

# Drop the "from.infected", "to.infected", and "discordant" columns from el,
# because they'll actually change with every iteration
el %<>% select(-from.infected, -to.infected, -discordant)

# Next, run the loop
set.seed(27708)
for (t in 2:max.time) {
  infections[[t]] <- infections[[t - 1]]
  
  # Pick an edge at random
  random.edge <- sample(nrow(el), size = 1)
  
  # If the edge is discordant, flip a coin to decide if the infection spreads
  if (with(el[random.edge, ], 
           infections[[t]][from] != infections[[t]][to])) {
    
    who.susceptible <- with(
      el[random.edge, ],
      c(from, to)[!c(infections[[t]][from], infections[[t]][to])]
      )
    
    infections[[t]][who.susceptible] <- sample(
      c(T, F), 
      size = 1, 
      prob = c(p.infection, 1 - p.infection)
    )
  }
}


# Now we have a list of who was infected at what time point.  Let's combine
# that into a data.frame, so we can work with it more easily
(results <- infections %>%
  lapply(FUN = as.data.frame) %>%
  lapply(FUN = set_names, value = "infected") %>%
  bind_rows(.id = "t") %>%
  mutate(id = rep(1:network.size(net), times = max.time),
         t = as.integer(t)) %>%
  tbl_df)

# This dataset is the raw results of our simulation, but it's usually easier to
# look at a summary.  Let's look at the number of people infected over time
infections.by.time <- results %>%
  group_by(t) %>%
  summarize(n.infections = sum(infected)) %>%
  mutate_each(funs(as.numeric), t, n.infections)

# Aside: in R, there are many ways to solve the same problem.  We could have
# gone directly from the raw data to the summaries using the apply functions:
# infections.by.time <- data.frame(
#   t = 1:max.time,
#   n.infected = sapply(infections, sum)
# ) 

# Plotting this relationship shows the 
qplot(data = infections.by.time, x = t, y = n.infections, geom = "line")

# Or, alternatively, we could look at whether people who are more central
# are infected sooner
time.to.infection <- results %>%
  group_by(id) %>%
  summarize(time.infected = min(t[infected])) %>%
  arrange(id) %>%
  mutate(indegc = degree(net))

ggplot(time.to.infection, aes(x = indegc, y = time.infected)) + 
  geom_point() + 
  geom_smooth(method = "lm")

# For fun, let's plot a few slices of the network in time
set.seed(330)
net.layout <- ggnetwork(net) %>% 
  rename(id = vertex.names)

net.layout.by.time <- split(results, f = results$t) %>%
  lapply(FUN = right_join, y = net.layout, by = "id") %>% 
  bind_rows

net.layout.by.time %>% 
  filter(t %in% c(1, 100, 250, 500, 750, 1000)) %>%
  ggplot(aes(xend = xend, yend = yend, x = x, y = y)) + 
  geom_edges(color = "lightgray") +
  geom_nodes(aes(color = infected)) + 
  facet_wrap(~ t) + 
  theme_blank()

# We could animate this with gganimate... but it doesn't work on R v. 3.4.  See
# https://raw.githubusercontent.com/jalapic/rmeetup_examples/master/ggnetwork_gganimate.R
# for example code.

# The statnet team developed a new package, called EpiModel, to simplify running
# epidemic diffusion models.  Tutorials are online here:
# http://www.epimodel.org/, and we'll walk through one to illustrate.

# Set the parameters ahead of time
param <- param.icm(
  inf.prob = 0.2,  # probability of contracting an infection in a step
  act.rate = 0.25  # speed with which people interact
  )
init <- init.icm(
  s.num = 500,  # number of people who are infected initially
  i.num = 1     # number of people who are susceptible initially
  )
control <- control.icm(
  type = "SI",  # type of model, vs., e.g., SIR or SIS
  nsims = 10,   # number of simulations to run
  nsteps = 300  # number of steps per simulation
  )

# Run the model.  ICM means individual contact model (vs. a deterministic model,
# where the results are determined by a set of differential equations)
set.seed(919)
mod <- icm(param, init, control)

# Now we can look at some summaries of the model
# Look at the state of the model at a particular time slice
summary(mod, at = 125)  

# Look at a dataset with the state of the model at a given time
mod %>%
  as.data.frame(mod = "mean") %>%
  tbl_df

# Get a plot of the number of infections by time
plot(mod)

# We can customize the plot, showing the individual simulations, for example
plot(mod, y = "i.num", sim.lines = TRUE, mean.smooth = FALSE, 
     qnts.smooth = FALSE)

# The package is quite flexible.  It can incorporate demographic changes,
# different network structures, and even custom functions for each of the 
# components of the model (like different mortality schedules).  Check the 
# website for more examples of how to do this.

##### Example 2: Diffusion of innovations #####
# The second example is the diffusion of innovations.  The process by which
# innovations spread is less well-understood.  We usually simulate it with a 
# process that's very similar to diseases diffusing.

# In this case, we will focus on a "complex contagion", meaning something you
# have to hear about from multiple people before you will adopt it.

# Setup
# 1. Construct a small world network
# 2. Choose one person at random, and set him/her and his/her neighbors as 
#    initial adopters
# 3. Infect all the people who have more than tau neighbors infected
# 4. Set the people who were infected at the previous round as "recovered"
# 5. Repeat steps 

### Set parameters in advance ###
n <- 50
tau <- 2 / 6
max.time <- 50

### Step 1: Construct network ###
# Construct a small world network
set.seed(919)
sw.net <- igraph::sample_smallworld(1, n, 2, p = .2) %>%
  intergraph::asNetwork(.) 

### Step 2: Choose a person & neighbors at random as an initial adopter ###
# Set up vector to indicate adoption
adopters <- rep(F, n)

# Choose a person at random
initial.adopter <- sample(seq_len(n), size = 1)

# Get the list of people they're attached to
initial.neighbors <- get.neighborhood(sw.net, initial.adopter)

# Set them all as "adopters"
adopters[c(initial.adopter, initial.neighbors)] <- T

### Step 3: infect all people who have more than tau neighbors infected ###

# Let's look at one person, person 18, to see how this works
ego.extract(sw.net, ego = 20, neighborhood = "out")

# Person 18 hasn't adopted
adopters[20]

# About half of their neighbors have adopted
adopters[c(18, 19, 22, 25)]
mean(adopters[c(18, 19, 22, 25)])

# To decide whether the person adopts, we test whether the fraction of
# adopters is greater than tau
mean(adopters[c(18, 19, 22, 25)]) >= tau  # adoption!

# We can update everyone simultaneously using matrix multiplication
adj.mat <- sw.net[, ]
diag(adj.mat) <- 0  # set the diagonal to 0, b/c people don't weight themselves

# We could take the sum...
adj.mat %*% adopters

#... or the percentage
(adj.mat.rn <- adj.mat / rowSums(adj.mat))
adj.mat.rn %*% adopters

# And then we calculate the people who are above our threshold
(adj.mat.rn %*% adopters) >= tau

# Note that this actually allows people to abandon the innovation if enough of
# of their neighbors are not adopters.  For now we don't want that to happen, 
# so we'll only test people who are not yet adopters
ifelse(adopters, TRUE, ((adj.mat.rn %*% adopters) >= tau))

### Step 4: Repeat ###
# Again, we can take care of this by wrapping it in a loop
adopt <- vector(mode = "list", length = max.time)
adopt[[1]] <- adopters

for (t in 2:max.time) {
  adopt[[t]] <- ifelse(adopters, TRUE, ((adj.mat.rn %*% adopt[[t - 1]]) >= tau))
}

# Note that again we get the characteristic S-shaped curve:
data.frame(
  t = 1:max.time,
  n.adopt = sapply(adopt, sum)
) %>%
  ggplot(aes(x = t, y = n.adopt)) + 
  geom_line()

# Let's plot a few frames

set.seed(330)
sw.net.layout <- ggnetwork(sw.net) %>% 
  rename(id = vertex.names)

sw.net.layout.by.time <- adopt %>%
  lapply(FUN = as.data.frame) %>% 
  lapply(FUN = set_names, value = "adopter") %>% 
  lapply(FUN = mutate, id = 1:n) %>%
  lapply(FUN = right_join, y = sw.net.layout, by = "id") %>% 
  bind_rows(.id = "t") %>% 
  mutate(t = as.integer(t))

sw.net.layout.by.time %>% 
  filter(t < 10) %>% 
  ggplot(aes(xend = xend, yend = yend, x = x, y = y)) + 
  geom_edges(color = "lightgray") +
  geom_nodes(aes(color = adopter)) + 
  facet_wrap(~ t) + 
  theme_blank()

# There's a new package for analyzing the diffusion of innovations.  It focuses
# mainly on creating survival models for diffusion data (and has a different
# data structure) so we won't cover it here.  But it's well worth checking out
# the vignette:
vignette("analyzing-medical-innovation-data")


##### Example 3: Averaging ideas #####
# A third common approach use for diffusion simulation is to suggest how
# people's beliefs change through interaction with others.  A longstanding
# assumption is that people take the weighted average of their friends.

# Set the parameters in advance
self.weight <- 0.2
max.time <- 5

# Load the data and save a few values for convenience
load("faux_add_health_with_faux_attitude.Rdata")

n.people <- network.size(add.health)  
attitude <- (add.health %v% "faux.attitude")
adj.mat <- add.health[, ]  # save the adjacency matrix to its own object

# Plotting the network shows that attitude values are randomly distributed in
# this network
set.seed(27708)
net.layout <- ggnetwork(add.health)

p <- ggplot(net.layout, aes(x = x, y = y, xend = xend, yend = yend)) + 
  geom_edges(color = "lightgray", 
             arrow = arrow(length = unit(6, "pt"), type = "closed")) + 
  theme_blank() + 
  scale_color_distiller(palette = "Spectral")

p + geom_nodes(aes(color = faux.attitude), size = 4)

# In general, we assume that people update their attitude using the average of
# their neighbors' opinions, and their own opinion.  For example, person 3 names
# 4 friends: people 6, 29, 34, and 40
ego.extract(add.health, ego = 3, neighborhood = "out")

# Those people have 4, 2, 3, and 2 as their attitudes
attitude[c(6, 29, 34, 40)]

# Their mean value is 2.75
mean(attitude[c(6, 29, 34, 40)])

# Then we assume that people take the weighted average of their own attitudes
# and their friends' attitudes
self.weight * attitude[3] + (1 - self.weight) * mean(attitude[c(6, 29, 34, 40)])

# We could repeat this for everyone individually, but this can actually be done
# by matrix multiplication, which makes it much easier to program.

# First, we have to row-normalize (set rows so that they sum to 1 by dividing
# each row by its sum) the adjacency matrix
diag(adj.mat) <- 0  # set the diagonal to 0

(adj.mat.rn <- adj.mat / rowSums(adj.mat))  # note the missing values!
                                            # Isolates => dividing by 0

# NaN does not play nicely with other values, so swap it for 0 in the row-
# normalized matrix.
adj.mat.rn[which(is.nan(adj.mat.rn), arr.ind = T)] <- 0

# In general, we assume that isolates do not change under this model.
# Mechanically, we can either remove them from the analysis or we can set their
# self-weights to 1.

# Identify outdegree isolates
isos <- (degree(add.health, cmode == "outdegree") == 0)

# Set self-weights to 1:
self.weight.vec <- rep(self.weight, times = n.people)
self.weight.vec[isos] <- 1

# Remove isolates
self.weight.no.iso <- self.weight.vec[!isos]
adj.mat.no.iso <- adj.mat.rn[!isos, !isos]
attitude.no.iso <- attitude[!isos]

# Now run the simulation...

# ... with self-weights of isolates = 1
I <- diag(n.people)  # identity matrix
SW <- diag(self.weight.vec)  # diagonal matrix with self-weights on diagonal

(adj.mat.rn %*% (I - SW) + SW) %*% attitude

# ... removing the isolates
I.no.iso <- diag(sum(!isos))
SW.no.iso <- diag(self.weight.vec[!isos])

(adj.mat.no.iso %*% (I.no.iso - SW.no.iso) + SW.no.iso) %*% attitude.no.iso

# Similarly to the other simulations, we can repeatedly perform this procedure
# using a for loop:
sim.attitude <- vector(mode = "list", length = max.time)
sim.attitude[[1]] <- attitude

for (t in 2:max.time) {
  sim.attitude[[t]] <- (adj.mat.rn %*% (I - SW) + SW) %*% sim.attitude[[t - 1]]
}

# Now let's plot the values by time, to show how the diffusion process unfolds

# Convert each list element to a data frame that ggplot can use, & combine them
layout.with.attitudes <- sim.attitude %>%
  lapply(FUN = as.data.frame) %>%
  lapply(FUN = set_names, value = "attitude.value") %>%
  lapply(FUN = mutate, vertex.names = seq_len(n.people)) %>%
  lapply(FUN = right_join, y = net.layout, by = "vertex.names") %>%
  bind_rows(.id = "iteration")

# Plot it, with facets to show how this changes over each iteration
p %+% 
  layout.with.attitudes +
  geom_nodes(aes(color = attitude.value), size = 4) +
  facet_wrap(~ iteration)
  
# A package that's under development, latentnetDiffusion, takes care of the
# matrix multiplication and reshaping of the data for you
devtools::install_github("jcfisher/latentnetDiffusion")
library(latentnetDiffusion)
library(latentnet)

# For example, to run the same simulation as above, use:
(sim.attitudes.2 <- degroot(W = add.health, Y = as.matrix(attitude), 
                            all.iter = T, self.weight = 0.8) %>%
  tidyDegroot(id = seq_len(n.people)) )

# Or, we can fit a latent space model, and simulate the diffusion process over
# the latent space:
# fit <- latentnet::ergmm(add.health ~ euclidean(d = 2))  # takes a minute!
# save(fit, file = "ergmm_2d_fit.Rdata")
load("ergmm_2d_fit.Rdata")

(ls.attitudes <- ergmmDegroot(fit, Y = as.matrix(attitude), simulate = F, 
                             all.iter = T) %>%
  tidyDegrootList(id = seq_len(n.people)))


# Plot it, to show the difference
layout.with.ls.attitudes <- ls.attitudes %>%
  rename(vertex.names = id) %>% 
  {split(., f = .$iteration)} %>% 
  lapply(FUN = right_join, y = net.layout, by = "vertex.names") %>%
  bind_rows(.id = "iteration")

p %+% 
  layout.with.ls.attitudes +
  geom_nodes(aes(color = pred.value), size = 4) +
  facet_wrap(~ iteration)
  
# The same model, simulated:
set.seed(919)
(ls.attitudes.sim <- ergmmDegroot(fit, Y = as.matrix(attitude), simulate = T, 
                                  all.iter = T) %>%
    tidyDegrootList(id = seq_len(n.people)))
  
# Plotted again
layout.with.ls.attitudes.sim <- ls.attitudes.sim %>%
  rename(vertex.names = id) %>% 
  {split(., f = .$iteration)} %>% 
  lapply(FUN = right_join, y = net.layout, by = "vertex.names") %>%
  bind_rows(.id = "iteration")

p %+% 
  layout.with.ls.attitudes.sim +
  geom_nodes(aes(color = pred.value), size = 4) +
  facet_wrap(~ iteration)