#Data management / R intro

#Download R source file
            
              #R Orientation 
#Author: Jonathan H. Morgan
#Based in Part on Jake Fisher's introduction to R: https://dnac.ssri.duke.edu/r-labs/2017/01_data_management.php
#9 May 2018

###################################
#   STARTING FROM A CLEAR SLATE   #
###################################

#Remove: Removes objects from memory
#ls() lists the objects in the current environment and rm() deletes every object on that list.
#Packages that are already loaded are not affected by this step.
rm(list = ls())

#Garbage Collection: Frees up memory, but preserves variables created in previous steps
#When run at the console, gc() also prints a short report of the memory currently in use.
gc()

#######################################
#   INSTALLING AND LOADING PACKAGES   #
#######################################

#R is a modular language. 
#The intuition is that you have in memory only what you need to perform the analysis tasks specified in the script. 
#Consequently, we need to load the packages we will be using during our analyses each time we run a new instance of R.

#Installing a Package
    #A package only needs to be installed once per machine. Installing a package multiple
    #times can result in R being unable to read the package files.
    #requireNamespace() returns FALSE when the package cannot be found, so readr is only
    #downloaded when it is actually missing.
    #You can also use the search window in the Packages tab to check whether a particular
    #package is installed on your machine.
    if (!requireNamespace("readr", quietly = TRUE)) {
        install.packages("readr")
    }

#Reading a Package
library(readr)        #Import csv and other delimited files
library(haven)        #Import SPSS, SAS, or Stata files
library(magrittr)     #Supports pipe (%>%) commands that allow you to perform multiple operations with one statement
library(dplyr)        #Manipulate data
library(tidyr)        #Additional functions for manipulating data
library(ggplot2)      #Visualizing data
library(statnet)      #Network Analysis Software
library(ggnetwork)    #Network Visualization

##################################
#   DATA AND OBJECT TYPES IN R   #
##################################

# R is an object-oriented language, which puts it somewhere between statistical
# computing languages such as SAS, Stata, or SPSS, and object-oriented programming
# languages such as Python or Java.  
# We will focus mainly on using R for statistical computing, but 
# I will demonstrate one instance where writing a simple function is quite useful for importing large data sets.

#DATA TYPES
    #Logical:  TRUE or FALSE
    #Integer:  1, 2, 3, 4, ...
    #Numeric:  1.2, 3.5, 5.5, 0, -1
    #Complex:  1 + 2i (imaginary numbers)
    #Character:  "Jon", "1.2" 
    #Raw:  Raw bytes, printed in hexadecimal (e.g., as.raw(255) prints as ff)
    #Function: Essentially, conditional statements or transformations you want to apply to multiple cases 
    #that utilize operations from Base R or packages that you load.

#When reading in data, R will, by default, treat columns with different data types as different types of objects. 
#There are a few instances where this can be problematic. 
#For example, base R functions such as read.csv (in versions of R before 4.0) treat a column of character values as a factor, 
#essentially treating it as a categorical variable when you may simply want a list of names.
#We can avoid these problems if we are mindful of the data types in our data,
#and specify the data type when importing our data in R.
#We can also "coerce" or transform a variable from one type to another. 
#We discuss both methods in this orientation. 

#OBJECT TYPES: Vectors, Lists, and Factors Oh My!
    #Vectors
    #Lists
    #Factors
    #Arrays
    #Matrices
    #Data Frames
    #Functions

#Vectors: A vector is an ordered collection of elements that all share one basic type.

c(2, 3, 5)                              #A numeric vector
c(TRUE, FALSE, TRUE, FALSE, FALSE)      #A logical vector
c("aa", "bb", "cc", "dd", "ee")         #A character vector

#Lists: A list is a generic vector whose members can be objects of any type and length.

#Here x is a list holding copies of the three vectors n, c, and l, plus the single number 3.
#Naming a variable c does not break the function c(): when c is called as a function,
#R skips over objects that are not functions.
 n <- c(2, 3, 5)
 c <- c("aa", "bb", "cc", "dd", "ee")
 l <- c(TRUE, FALSE, TRUE, FALSE, FALSE)
 x <- list(n, c, l, 3)   # x contains copies of n, c, l and the number 3

    #Slicing a list: single square brackets "[]" return a slice, which is itself a list.
    #This slice holds the second member of x, the copy of c.
    x[2]

    #Modifying a list
    #Double brackets reach inside the list to the second member itself; the trailing [1]
    #then picks the first element of that member, which is overwritten with "ta".
    x[[2]][1] <- "ta"
    x[2]
    
#Factors: A vector of integer codes paired with a set of character labels that are shown when the factor is printed.
  #Factors are how R represents categorical variables.

    #Building an example factor from thirteen numeric codes
    data <- c(1, 2, 2, 3, 1, 2, 3, 3, 1, 2, 3, 3, 1)
    factor <- factor(data)  #The distinct values 1, 2, and 3 become the levels, similar to SAS's class statement.
    factor

    #Attaching a label to each of the levels 1 2 3
    factor <- factor(factor, labels = c("I", "II", "III"))
    factor

    #When importing data, R may store variables that it thinks have levels as factors.
    #This can be problematic because R then treats the variable as a categorical variable,
    #and thus will not perform many basic operations (arithmetic, for example) on it.
    
#Arrays: A multidimensional rectangular data object.
    #"Rectangular" means that every row has the same length, and so does every column.

    Three_D_Array <- array(
        data = seq_len(24),                      #The 24 cell values (4 x 3 x 2 = 24)
        dim = c(4, 3, 2),                        #Three dimensions of length 4, 3, and 2
        dimnames = list(
            c("one", "two", "three", "four"),    #Labels for the first dimension
            c("ein", "zwei", "drei"),            #Labels for the second dimension
            c("un", "deux")                      #Labels for the third dimension
        )
    )
    Three_D_Array  #English Numbers by German Numbers by French Numbers
    
#Matrix: A collection of data elements arranged in a two-dimensional rectangular layout.
    #A matrix is the two-dimensional special case of an array.

    Matrix <- matrix(
        data = 1:12,           #The twelve cell values, filled in column by column
        ncol = 3,              #Three columns; with 12 values, nrow = 4 gives the same layout
        dimnames = list(
            c("one", "two", "three", "four"),    #Row names
            c("ein", "zwei", "drei")             #Column names
        )
    )
    Matrix
    
#Data Frame:  A list of vectors of equal length.
    #Unlike a matrix, a data frame may hold a different data type in each column;
    #within any one column, however, all data elements are the same type.

    #Data frames are R's counterpart to a classic statistical package's data set.
    #When printed, the top line of the table is a header that contains the column names.
    #Each line after the header is a data row: it starts with the name of the row,
    #which is followed by the actual data.
    #Each data member of a row is called a cell.

    n <- c(2, 3, 5)
    c <- c("aa", "bb", "cc")
    l <- c(TRUE, FALSE, TRUE)
    data_frame <- data.frame(n, c, l)      #The columns take the names of the vectors: n, c, and l.
                                           #In R forums, df is often used to refer to a data frame.
    data_frame
    
######################
#   IMPORTING DATA   #
######################

#Getting and Setting Your Work Directory
    #It's important to know where you are saving the data.
    #By default, R will save your data to the highest level of your user directory.
    
    #You can determine where R is saving your data by using the following command:
    #getwd() returns the path of the current working directory.
    getwd()
    
    #We can set a working directory, which is quite useful because we then do not have to specify 
    #the file location of each of our data sets when we import them.
    #You can even synchronize your work directory with an online directory.
    #We can set our work directory by using the following command:
    #NOTE: this path is specific to the author's machine; replace it with a folder on your own computer.
    setwd("C:/Users/Jonathan H Morgan/Desktop/SN&H 2018")  # Note: forward slashes, even on Windows

#Importing data into R
    #There are numerous functions and packages for importing data into R. I am going to primarily
    #discuss "readr" because this package is capable of importing multiple data types, and is
    #capable of importing large data sets (e.g., 87 GB).
    #For importing SPSS, SAS, and Stata files directly, we recommend using the "haven" package.
    #Documentation for Haven: https://cran.r-project.org/web/packages/haven/haven.pdf

    #R does provide a GUI based option: file.choose() opens a dialog for picking the file.
    #This is not optimal for large data sets.
    AHS_Base <- read.csv(file = file.choose(), header = TRUE)

    #Reading the CSV with readr, which infers each column's data type from the first 1000 rows.
    #col_names = TRUE treats the first row of the file as the column names.
    AHS_Base <- read_csv(
        file = "C:/Users/Jonathan H Morgan/Desktop/SN&H 2018/ahs_wpvar.csv",
        col_names = TRUE
    )
    
    #Useful functionality when importing very large data sets by subsets

    #f: callback applied to each chunk of the file; keeps only the rows of interest.
    #  x:     the current chunk, as a data frame
    #  pos:   second argument passed in by the chunked reader; not used here
    #  col:   index (or name) of the column to test; column 27 holds gender
    #  value: value to keep; 2 isolates female respondents
    #Returns the rows of x whose col equals value, with all columns kept.
    #Rows where col is missing (NA) are dropped.
    #The column is addressed by its index number by default because this notation
    #works whether the file has a header or not.
    #Bracket indexing is used in place of subset() because subset() evaluates its condition
    #inside the data: a column that happened to be named "x" would be used instead of the chunk.
    f <- function(x, pos, col = 27, value = 2) {
        x[which(x[[col]] == value), , drop = FALSE]
    }
    
    #Reading the file in pieces of 10000 rows. Each piece is handed to f, and the rows that
    #f returns are combined into the final data frame.
    AHS_Base <- read_csv_chunked(
        file = "C:/Users/Jonathan H Morgan/Desktop/SN&H 2018/ahs_wpvar.csv",
        callback = DataFrameCallback$new(f),
        chunk_size = 10000,
        col_names = TRUE,
        progress = TRUE
    )
    
    #Transforming Grade and Sex into factors
        #cols names the variables I want to transform
        cols <- c("sex", "grade")

    #mutate_at() applies factor() to every column named in cols and overwrites that column.
    #The %<>% pipe sends AHS_Base into mutate_at() and assigns the result back to AHS_Base.
    #(The older mutate_each_() and funs() helpers are deprecated in dplyr.)
    AHS_Base %<>%
        mutate_at(cols, factor)

    #Confirming that sex and grade now have levels
    str(AHS_Base)
  
##############################
#   DATA MANAGEMENT BASICS   #
##############################
    
#The Basic Grammar of Data Management in R
    #Selecting
    #Arranging
    #Mutating
    #Filtering
    #Renaming
    #Gathering
    #Summarizing
    #Separating
    #Making Distinct
    #Joining
    
#Selecting: "Selecting" always refers to choosing the columns you want to keep.
    #mfnid_1:mfnid_5 picks every column from mfnid_1 through mfnid_5.
    AHS_Edges <- select(AHS_Base, ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade)

#Arranging: "Arranging" reorders rows with respect to the values in one or more columns.
    AHS_Edges <- AHS_Base %>%
      select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>%   #The pipe chains commands together
      arrange(ego_nid, sex)                                               #Sorting by ego ID, then by gender

#Mutating:  "Mutating" refers to creating a new variable based on operations performed on another variable.
    #Mutating is admittedly the strangest function name in the R Tidyverse, but it refers to the idea that
    #a new variable is the result of a transformation of an old one.
    #Female is 1 where sex equals 2 and 0 otherwise (it stays missing where sex is missing).
    AHS_Edges <- AHS_Base %>%
      select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>%
      arrange(ego_nid, sex) %>%
      mutate(Female = ifelse(sex == 2, 1, 0))


#Filtering: "Filtering" refers to keeping only the rows that meet a condition
    #(e.g., choosing only 7th grade girls in this case).
    #A double == tests for equality. Conditions separated by commas must all hold.
    AHS_Edges <- AHS_Base %>%
      select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>%
      arrange(ego_nid, sex) %>%
      mutate(Female = ifelse(sex == 2, 1, 0)) %>%
      filter(grade == "7", Female == 1)

#Renaming: "Renaming" refers to relabeling column names, written as new name = old name.
    AHS_Edges <- AHS_Base %>%
      select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>%
      arrange(ego_nid, sex) %>%
      mutate(Female = ifelse(sex == 2, 1, 0)) %>%
      filter(grade == "7", Female == 1) %>%
      rename(id = ego_nid, gender = sex)

#Gathering:  "Gathering" refers to gathering columns to transform a wide data set into a long one.
    #The ten friend columns are stacked: Alter_Label records which column a value came from,
    #Target holds the value itself, and na.rm = TRUE drops the rows where the value is missing.
    AHS_Edges <- AHS_Base %>%
      select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, grade, sex) %>%
      arrange(ego_nid, sex) %>%
      mutate(Female = ifelse(sex == 2, 1, 0)) %>%
      filter(grade == "7", Female == 1) %>%
      rename(id = ego_nid, gender = sex) %>%
      gather(Alter_Label, Target, mfnid_1:mfnid_5, ffnid_1:ffnid_5, na.rm = TRUE)
    
#Summarizing:  "Summarizing" refers to generating summary statistics for a given variable.
    #In this case, we are going to calculate the average number of friends boys and girls have

gc()

    #Reading the data in again to calculate separate gender means
    AHS_Base <- read_csv(
      file = "C:/Users/Jonathan H Morgan/Desktop/SN&H 2018/ahs_wpvar.csv",
      col_names = TRUE
    )

    #Building a long edgelist: one row per ego and named friend
    AHS_Edges <- AHS_Base %>%
      select(ego_nid, mfnid_1:mfnid_5, ffnid_1:ffnid_5, commcnt, sex) %>%
      gather(Alter_Label, Target, mfnid_1:mfnid_5, ffnid_1:ffnid_5, na.rm = TRUE) %>%
      arrange(ego_nid, sex) %>%
      filter(Target != 99999)                  #Eliminating 99999 values

    #Generating Summary Statistics
    Gender_Mean <- AHS_Edges %>%
      filter(commcnt == 7, sex != 0) %>%       #Examining community 7's school network, and dropping 0s
      group_by(ego_nid, sex, commcnt) %>%      #Grouping by ego ID so that each student's rows are counted together
      summarise(count = n()) %>%               #Counting each student's alters
      group_by(sex) %>%                        #Regrouping by gender to generate separate averages
      summarise(Gender_Mean = mean(count))     #Generating the male and female averages

gc()
    
#Separating: "Separating" refers to splitting delimited values in one column into multiple columns
    #Separating is very useful when dealing with delimited items in text data.
    #For example, Qualtrics output for questions where respondents can make multiple responses
    #has each response to the given question separated by commas in one column.
    #The separate function combined with gather can be quite useful for splitting responses,
    #and then grouping them by each respondent.

    #Simulating data where the output is a string
    ID = c("Jim", "Molly", "Jaemin")
    Male_Friends = c("Jon Jaemin Joe Jim", "Jim Mudit Marcus", "Jim Peter Chris Marcus")
    Female_Friends = c("MC Molly Liann", "Crystal Molly Liann", "MC Molly Crystal")
    data_frame = data.frame(ID, Male_Friends, Female_Friends)       #In R forums, df is often used to refer to a data frame.
    data_frame

    #Converting any factor variables into character variables to avoid potential problems
    #with gathering and separating data.
    data_frame %<>%
      mutate_if(is.factor, as.character)

    #This data is a mess, let's fix it
    Edges <- data_frame %>%
      select(ID, Male_Friends, Female_Friends) %>%
      separate(Male_Friends,                                   #Splitting the male friends at each space,
               into = c("MF_1", "MF_2", "MF_3", "MF_4"),       #one name per new column.
               sep = " ",
               fill = "right") %>%                             #Molly names only three male friends:
                                                               #fill = "right" leaves MF_4 missing for her
                                                               #without raising a warning.
      separate(Female_Friends,                                 #Repeating this step for female friends
               into = c("FF_1", "FF_2", "FF_3"),
               sep = " ",
               fill = "right") %>%
      gather(Alter_Label, Target, MF_1:MF_4, FF_1:FF_3, na.rm = TRUE) %>% #Gathering all the variables to create
                                                                          #an edgelist; na.rm drops the empty MF_4
      select(ID, Target)                                                  #Dropping Alter_Label
    
    #We have got the data into something we can use, but character IDs can be problematic
    #Let's make unique numeric IDs for all the nodes

#Distinct: Eliminates all duplicate values
  #Nodes ends up with one row per person: Sender_ID (a numeric ID) and ID (the person's name).
  Nodes <- Edges %>%
    gather(Variable_Label, Sender, ID, Target, na.rm = TRUE) %>%  #Gathering ID and Target into one list
    mutate(ID = Sender) %>%                                       #Creating Node Labels for later
    select(ID) %>%                                                #Dropping the other variables
    distinct(ID) %>%                                              #Isolating unique cases
    transmute(Sender_ID = as.numeric(row_number()),               #Numbering the rows 1, 2, 3, ... to create
              ID)                                                 #sequential numeric IDs. row_number() replaces
                                                                  #the deprecated add_rownames() helper.
  
#Joining:  "Joining" refers to merging data sets using a key variable.
    #There are several kinds of joins. We are going to do left and right joins in this case.
    #To learn more about joins see: http://www.rpubs.com/williamsurles/293454

  #Attaching each sender's numeric ID (Sender_ID) to the edgelist by matching on the ID variable
  Edges <- left_join(Edges, Nodes, by = "ID")

  #Relabeling the node list so that it can be matched against Target to get Target_ID
  Nodes <- rename(Nodes, Target_ID = Sender_ID, Target = ID)

  #Attaching each alter's numeric ID (Target_ID) by matching on Target.
  #A right join keeps every row of Nodes, so a node that is never named as a Target
  #would contribute a row with a missing Sender_ID.
  Edges <- right_join(Edges, Nodes, by = "Target")

  #Final Formatting: keeping only the two numeric ID columns, relabeled Sender and Target
  Edges <- select(Edges, Sender = Sender_ID, Target = Target_ID)
  
################################################################
#   VISUALIZING OUR SIMULATED NETWORK: PREPARATION FOR DAY 2   #
################################################################
  
  #Step 1: Formatting Sender and Target Variables to Construct a Statnet Network Object
  #Double brackets pull each column out as a plain vector whether Edges is a data frame or a tibble.
  #(On a tibble, as.character(Edges[, 1]) would collapse the whole column into a single string.)
  Edges[[1]] <- as.character(Edges[[1]])
  Edges[[2]] <- as.character(Edges[[2]])

  #Step 2: Creating a Network Object
  #Note, this is a directed graph. So, we specify that in the network object now.
  #The specification of the graph as either directed or undirected is important because
  #it impacts fundamentally how we interpret the relationships described by the graph.
  AHS_Network <- network(Edges, matrix.type = "edgelist", directed = TRUE)

  #Step 3: Creating a label for each vertex
  #network() decides the order of the vertices, and that order is not guaranteed to follow the
  #row order of Nodes (character IDs may, for example, be ordered "1", "10", "11", "12", "2", ...).
  #Each vertex name is therefore looked up in Nodes so that every vertex receives its own label.
  Vertex_Rows <- match(network.vertex.names(AHS_Network), as.character(Nodes$Target_ID))
  Label <- as.vector(Nodes$Target)[Vertex_Rows]

  #Step 4: Assigning Attributes to Vertices from our nodelist
  set.vertex.attribute(AHS_Network, "Label", Label)

  #Step 5: Visualizing the Network
  AHS_Network
  summary(AHS_Network)                                #Get numerical summaries of the network

  set.seed(12345)                                     #Fixing the seed so the layout is reproducible
  ggnetwork(AHS_Network) %>%
    ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
    geom_edges(color = "lightgray") +
    geom_nodelabel_repel(aes(label = Label)) +        #Labeling each node with its Label vertex attribute.
                                                      #Labels are practical for networks with few nodes.
    theme_blank() +
    geom_density_2d()