Chapter 2 Analysis

2.1 Dataset

library(tidyverse)
library(readxl)
library(lubridate)

# Load the GPS records, keep rows with a positive latitude (filter() also
# drops rows whose latitude is NA), and derive the calendar fields and time
# indices from the recording timestamp `dtRecorded`.
dat <- read_excel("./_data/VDOT_GIS_v3.xlsx") %>%
  filter(latitude > 0) %>%
  mutate(
    # Calendar components of the timestamp (lubridate helpers).
    month = month(dtRecorded),
    year = year(dtRecorded),
    semester = semester(dtRecorded),
    quater = quarter(dtRecorded),
    # Period counters anchored in 2013: September 2013 maps to index = 0 and
    # the third quarter of 2013 maps to index_q = 0.
    # NOTE(review): the second semester of 2013 maps to index_sem = 1, not 0,
    # unlike the other two counters -- confirm this offset is intended.
    index = (year - 2013) * 12 + month - 9,
    index_sem = (year - 2013) * 2 + semester - 1,
    index_q = (year - 2013) * 4 + quater - 3,
    # 1 when the point falls outside the box latitude 20..48,
    # longitude -123..-80; 0 when it falls inside.
    out_bound = ifelse(
      longitude > -80 | longitude < -123 | latitude < 20 | latitude > 48,
      1,
      0
    )
  )

# Patient-level covariates: read the CSV with strings kept as character and
# retain only the columns listed below, in this order.
covars <- dplyr::select(
  read.csv(
    "./_data/VDOT_PatientLevelData_2_27_18.csv",
    stringsAsFactors = FALSE
  ),
  incomea, incommo, dpndntb, marij, drugs, age, latino, race, sex, cborn,
  edu, ADH_A, ADH_AS, ADH_ASP, ADHR_A, patientID, C_endstat, PI_Hisp,
  PI_IDU, PI_IDUyr
)

# One row per patient, with that patient's (longitude, latitude) records
# nested in the list-column `data`; only patients with enough points are kept.
dat.n <- dat %>%
  dplyr::select(patientID, longitude, latitude) %>%
  # Name the nested column explicitly: the unnamed form `nest(-patientID)` is
  # deprecated in tidyr >= 1.0 and emits a warning there.
  nest(data = -patientID) %>%
  mutate(
    # map_int() returns plain integer columns, so the comparison in filter()
    # works on integers rather than on list-columns as map() would produce.
    n = map_int(data, nrow),                   # number of records
    uniq = map_int(data, ~ nrow(unique(.x)))   # number of distinct coordinate pairs
  ) %>%
  # Individuals with enough sample size: more than two records and more than
  # two distinct locations.
  filter(n > 2 & uniq > 2)

2.2 Example: Single Participant

2.2.1 Hierarchical Clustering

library(factoextra)
source("./_data/vdot_helpers.R")

# Nested coordinate table (longitude, latitude) for patient 100. The `[[1]]`
# fails with a subscript error if that patient is not present in dat.n.
a <- dat.n$data[dat.n$patientID == 100][[1]]

# Scatter plot of this participant's recorded locations; semi-transparent
# points make overlapping records visible.
ggplot(a, aes(longitude, latitude)) +
  geom_point(size = 3, alpha = .5) +
  theme_bw()

# n.clust() is defined in vdot_helpers.R. `TRUE` is spelled out because the
# alias `T` is an ordinary variable that can be reassigned.
# NOTE(review): `full = TRUE` appears to request the optimal k plus the
# diagnostic plots (see the echoed output) -- confirm against the helper.
n.clust(a, method = "silhouette", full = TRUE)

## $`Optimal k`
## [1] 2
## 
## $`Plot optimization`

## 
## $`Cluster Plot`

2.2.2 Spatial Clustering

library(sp)
library(rgdal)
library(geosphere)

# Spatial clustering of the example participant without overriding `d`
# (n.clust.sp() is defined in vdot_helpers.R). `TRUE` is spelled out because
# the alias `T` can be reassigned.
n.clust.sp(a, full = TRUE)

## $K
## [1] 13
## 
## $`Cluster Plot`

# Same clustering with d = 10000. `TRUE` replaces the reassignable alias `T`.
# NOTE(review): `d` is presumably a distance threshold (metres?) used to merge
# points into clusters -- confirm against vdot_helpers.R.
n.clust.sp(a, full = TRUE, d = 10000)

## $K
## [1] 6
## 
## $`Cluster Plot`

# Same clustering with a much larger d = 200000. `TRUE` replaces the
# reassignable alias `T`.
# NOTE(review): `d` is presumably a distance threshold (metres?) -- confirm
# against vdot_helpers.R.
n.clust.sp(a, full = TRUE, d = 200000)

## $K
## [1] 3
## 
## $`Cluster Plot`