This is an exploratory study of data breaches reported to the U.S. Department of Health and Human Services (HHS) under HIPAA.

library(tidyverse)
## ── Attaching packages ────────────────────────────────── tidyverse 1.3.1.9000 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Load the breach report. The file's own header row is skipped and replaced
# with short snake_case column names; num_individuals is parsed as integer.
data <- read_csv(
  file = "breach_report_all.csv",
  col_names = c(
    "name",
    "state",
    "type_of_entity",
    "num_individuals",
    "date",
    "type_of_breach",
    "point_of_breach",
    "business",
    "description"
  ),
  col_types = cols(num_individuals = col_integer()),
  skip = 1
)

# Parse the month/day/year date strings and split out calendar components.
# mutate() evaluates its arguments in order, so year/month/day are derived
# from the already-parsed date.
data <- data %>%
  mutate(
    date = mdy(date),
    year = year(date),
    month = month(date),
    day = day(date)
  )
# Drop rows without an affected-individuals count, then reduce the free-text
# breach type to its first space-separated word (type_of_breach1) and
# normalise the spellings that carry a trailing comma or mean "unknown".
data <- data %>%
  drop_na(num_individuals) %>%
  mutate(
    # First column of the split matrix = first word. paste0() renders a
    # missing value as the string "NA", which the "NA" entry below catches.
    type_of_breach1 = paste0(
      str_split(type_of_breach, " ", simplify = TRUE)[, 1]
    ),
    type_of_breach1 = recode(
      type_of_breach1,
      "Loss," = "Loss",
      "Other," = "Unknown",
      "Theft," = "Theft",
      "NA" = "Unknown",
      "Other" = "Unknown"
    )
  )
# Reduce the free-text breach location to its first space-separated word
# (point_of_breach1), strip trailing commas, and expand the two location
# names that were cut off at their first space.
data <- data %>%
  mutate(
    # First column of the split matrix = first word of point_of_breach.
    point_of_breach1 = paste0(
      str_split(point_of_breach, " ", simplify = TRUE)[, 1]
    ),
    point_of_breach1 = recode(
      point_of_breach1,
      "Email," = "Email",
      "Laptop," = "Laptop",
      "Other," = "Other",
      "Electronic" = "Electronic Medical Record",
      "Network" = "Network Server"
    )
  )
# Distribution of breach size (natural log of individuals affected) for each
# type of covered entity. Rows with no entity type are excluded.
data %>%
  drop_na(type_of_entity) %>%
  ggplot(aes(x = type_of_entity, y = log(num_individuals))) +
  geom_boxplot() +
  theme_classic() +
  theme(axis.text.x = element_text(angle = -45)) +
  labs(
    # The y axis is the log of individuals affected per breach, not a count
    # of incidents, so the label says so.
    y = "log(number of individuals affected)",
    # NULL is the documented way to remove an axis title.
    x = NULL
  )

# Same breach-size distribution per entity type, with every individual breach
# overlaid as a jittered point.
data %>%
  drop_na(type_of_entity) %>%
  ggplot(aes(x = type_of_entity, y = log(num_individuals))) +
  # Hide the boxplot's own outlier markers: geom_jitter() already draws every
  # observation, so outliers would otherwise appear twice.
  geom_boxplot(outlier.shape = NA) +
  geom_jitter() +
  theme_classic() +
  labs(
    y = "log(number of individuals affected)",
    x = NULL
  )

# Number of breach incidents per breach type, one panel per year after 2015.
# Breaches of unknown type are left out.
data %>%
  # NOTE(review): rows without an entity type are dropped here even though
  # entity type is not used in this chart; kept to preserve the original
  # counts -- confirm whether this is intended.
  drop_na(type_of_entity) %>%
  # Filter before aggregating; `year > 2015` already excludes 2015.
  filter(year > 2015, type_of_breach1 != "Unknown") %>%
  # count() returns an ungrouped tibble, so no `.groups` message is emitted.
  count(type_of_breach1, year, name = "num") %>%
  ggplot() +
  geom_col(aes(x = num, y = reorder(type_of_breach1, num))) +
  facet_wrap(~year) +
  theme_classic() +
  labs(
    x = "number of breach incidents",
    y = NULL
  )
## `summarise()` has grouped output by 'type_of_breach1'. You can override using
## the `.groups` argument.

# Monthly number of breach incidents for 2019-2021, one panel per breach type.
data %>%
  filter(year > 2018, year < 2022) %>%
  # One row per (year, month, breach type). Previously the monthly count was
  # attached to every breach row and plotted at each breach's own day, so the
  # line had many redundant points per month.
  count(year, month, type_of_breach1, name = "num") %>%
  # Place each monthly count on the first day of its month.
  mutate(date = make_date(year, month)) %>%
  ggplot(aes(date, num)) +
  geom_line(size = 1, col = 2) +
  facet_wrap(vars(type_of_breach1)) +
  theme_classic()

# Monthly summary per breach location: number of incidents (num) and mean
# individuals affected per incident (avg), dated to the first of the month.
# year and month are the columns derived when the dates were parsed.
data %>%
  group_by(year, month, point_of_breach1) %>%
  summarise(
    num = n(),
    avg = mean(num_individuals)
  ) %>%
  mutate(date = make_date(year, month))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 968 × 6
## # Groups:   year, month [160]
##     year month point_of_breach1   num    avg date      
##    <dbl> <dbl> <chr>            <int>  <dbl> <date>    
##  1  2009    10 Network Server       1  1000  2009-10-01
##  2  2009    10 Other                1   501  2009-10-01
##  3  2009    10 Paper/Films          1  1000  2009-10-01
##  4  2009    11 Desktop              5  3675. 2009-11-01
##  5  2009    11 Laptop               4  4261. 2009-11-01
##  6  2009    12 Desktop              2  1323  2009-12-01
##  7  2009    12 Email                1   610  2009-12-01
##  8  2009    12 Other                2  5298  2009-12-01
##  9  2009    12 Paper/Films          1 83000  2009-12-01
## 10  2010     1 Desktop              1  9309  2010-01-01
## # … with 958 more rows
# Monthly total of individuals affected (natural log) for 2016-2021, one panel
# per breach location.
data %>%
  filter(year > 2015, year < 2022) %>%
  group_by(year, month, point_of_breach1) %>%
  # summarise() gives one row per month and location; the earlier grouped
  # mutate() kept one row per breach and plotted each monthly total many times.
  summarise(total = sum(num_individuals), .groups = "drop") %>%
  # Place each monthly total on the first day of its month.
  mutate(date = make_date(year, month)) %>%
  ggplot(aes(date, log(total))) +
  geom_line(size = 1, col = 2) +
  facet_wrap(vars(point_of_breach1)) +
  theme_classic()

# Publish the rendered book; the recorded output shows it deploying to
# bookdown.org. This is a deployment side effect, not part of the analysis.
# NOTE(review): consider running this manually rather than on every knit.
bookdown::publish_book()
## Preparing to deploy site...DONE
## Uploading bundle for site: 11570...DONE
## Deploying bundle: 80681 for site: 11570 ...
## [Connect] Building static content...
## [Connect] Launching static content...
## Site successfully deployed to https://bookdown.org/mehdibarati22/my-website/