Chapter 9 graphs for public health I

9.1 데이터 그림 준비

실습 자료는 2017년 근로환경조사 자료를 통해 실습할 것입니다.. 자료는 안전보건공단, 근로환경조사 원시자료 사이트 (http://kosha.or.kr/kosha/data/primitiveData.do) 에서 신청할 수 있습니다.. 시각화는 표와 그래프 둘을 다룰 것이고, 표는 보건학에서 주로 사용하는 표1,2,3과 그래프는 막대 그래프부터 시계열 자료까지 나타내도록 하겠습니다. 실습에 필요한 라이브러리를 불러오겠습니다.

library(devtools)

## Error in get(genname, envir = envir) : 
##   객체 'testthat_print'를 찾을 수 없습니다

library(Table1)
library(ggplot2)
library(tidyverse)
library(htmlTable)
library(haven)
library(gmodels)

저번 시간에 이어 근로환경조사 자료를 불러오고 온콜과 우울에 대한 데이터를 불러오겠습니다. 데이터 변형에서 실습했던 코드입니다.

a=read_sas("data/kwcs5th.sas7bdat") # 근로환경조사 5차
# 변수생성: 온콜 여부/빈도, 우울, 연령, 성별, 교육, 근무시간, 종사상지위, 고용형태, 소득#
a0<-a %>%
    select(Q35, AGE, TSEX, TEF1, Q22_1, Q05, Q06, EF11, EF12, Q69,Q62_1_8, Q35, # 기존 실습
           Q16_1, Q26_7, Q26_8, Q49_15, Q49_2, Q49_1) # 새로 추가된 실습과제 
# 변수 구획 정하기 ------

Wh_breaks <- c(-Inf, 35, 45, 55, 65, Inf)
Wh_labels <- c('<35','35-44','45-54','55-64','>=65')
inc_break <- c(-Inf, 100, 200, 300, 400, Inf)
inc_label <- c('<100', '100-199', '200-299', '300-399', '>400')

# 데이터 step ------
a1 <-a0 %>%
   filter(AGE <65 ) %>% 
   filter(!is.na(Q22_1)) %>% 
   filter(!TEF1    == 9) %>%
   filter(!Q69     == 9) %>%
   filter(!Q62_1_8 == 9) %>% 

   mutate(oncall    = Q35) %>%
   mutate(oncallgp  = ifelse(oncall %in% c(1, 2, 3), "on call", "non-on call")) %>%
   mutate(oncallgp3 = ifelse(oncall %in% c(1,2,3), "several times a month", 
                      ifelse(oncall %in% c(4), "rarely", "none"))) %>%
   mutate(oncallgp3 = factor(oncallgp3, 
                         levels=c("none", "rarely", "several times a month")))%>%
   mutate(agegp     = ifelse(AGE <30, '<30', 
                             ifelse(AGE <40, '30-49', 
                                    ifelse(AGE <50, '40-49', 
                                           ifelse(AGE < 60, '50-59', '≥60'))))) %>%
   mutate(Wh=cut(Q22_1, breaks=Wh_breaks, include.lowest=TRUE, right=FALSE, 
                        labels=Wh_labels)) %>% 
   mutate(Wh=structure(Wh, label='Working hours')) %>%
   mutate(Gender=factor(TSEX, levels=c(1,2), labels=c('Men', 'Women') )) %>%
   mutate(Education=factor(TEF1, levels=c(1,2,3,4), 
                        labels=(c('Primary', 'Middle', 'High', 'University')))) %>%
   mutate(Statusw=ifelse(Q05 %in% c(1,2), 'Self employer', 
                         ifelse(Q05 %in% c(3), 'Paid worker', 
                                'Family workers and others'))) %>%
   mutate(inc1=ifelse(EF11 <10000, EF11, ifelse(EF12<10, EF12*100-50, NA) )) %>%
   mutate(inc=cut(inc1, breaks=inc_break, labels=inc_label)) %>%
   mutate(job_st = factor(Q69, levels = c(1:4), 
                  labels = c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied"))) %>%
   mutate(job_st = structure(job_st, label = 'Job Satisfaction')) %>%
   mutate(depression = Q62_1_8)%>%
   mutate(depression = structure(factor(depression, levels=c(1, 2), 
                                     labels=c("Depression", "Non depression"))))

9.2 실습 자료 시각화

이 데이터를 이용해서 다음과 같은 그래프를 그려보려고 합니다. 저번 시간에 oncall 의 횟수가 늘어나면 우울할 오즈비가 증가하는 것을 표로 나타내었는데요, 그래프를 보면 주 45시간 미만이면서 자영업과 가족종사자에서는 그렇지 않은 반면에, 장시간 근로자에서는 oncall 이 늘어날 수록 우울해 지는 것을 볼 수 있습니다. 왜 그럴까요? 라는 질문이 저절로 나오게 됩니다. 이처럼 그래프를 통해 변화되는 양상을 보여주는 것은 데이터의 속성을 탐구하게 만드는 원동력이 되고는 합니다. 따라서 데이터를 탐구할 때 그래프를 이용해 소통하는 작업을 자주 하는 것이 좋습니다.

그림1

위 그래프를 그리는 과정은 youtube에 올려 놓았습니다.

동영상	링크
oncall에 따른 우울감	https://youtu.be/w1305N2yH_g

** 동영상 **

우선 oncallgp3는 명목변수여서 그래프를 그리기 어렵습니다. 따라서 해당하는 연속변수를 만들어서 그래프를 그리겠습니다. 또한 장시간 근무를 45시간을 기준으로 (주 40시간이 기준이고, 하루 한시간 정도 추가 근무를 하면 45시간이 됩니다.)

 a2<- a1 %>%
  mutate(oncall3 = case_when(
    oncall %in% c(1,2, 3) ~ 3, # several times a month 
    oncall %in% c(4)      ~ 2, # rarely
    TRUE ~ 1                   # none
  )) %>%
  mutate(lwh = ifelse(Q22_1 > 45, 2, 1)) %>%
  mutate(lwhf = factor(lwh, levels = c(1,2), 
                         labels = c("Working hours less than 45", 
                                    "Working hours more than 45")))

oncall별 우울감이 얼마나 있는지 계산해 보겠습니다. 우선 전체 4만여명 중에 1006명이 우울감을 호소하고 있습니다.

a2 %>%
  group_by(depression) %>%
  count()

## # A tibble: 2 x 2
## # Groups:   depression [2]
##   depression         n
##   <fct>          <int>
## 1 Depression      1006
## 2 Non depression 40583

그렇다면 group_by()에 oncall3를 넣고 count에 depression을 넣어 온콜에 따른 우울감 분율을 계산해 보겠습니다.

a2 %>%
   group_by(oncall3) %>%
   count(depression)

## # A tibble: 6 x 3
## # Groups:   oncall3 [3]
##   oncall3 depression         n
##     <dbl> <fct>          <int>
## 1       1 Depression       773
## 2       1 Non depression 33244
## 3       2 Depression       164
## 4       2 Non depression  5637
## 5       3 Depression        69
## 6       3 Non depression  1702

이때 그래프를 그리기 위해 필요한 것은 Depression은 분율 또는 prevalance입니다. group별 우울감 여부 (n)을 group별 총 명수 (sum(n))으로 나누면 1번 집단은 773/(773+33244)의 크기로 우울감 유병율이 나타나게 됩니다.

a2 %>%
   group_by(oncall3) %>%
   count(depression) %>%
   mutate(prob = n/sum(n)*100)

## # A tibble: 6 x 4
## # Groups:   oncall3 [3]
##   oncall3 depression         n  prob
##     <dbl> <fct>          <int> <dbl>
## 1       1 Depression       773  2.27
## 2       1 Non depression 33244 97.7 
## 3       2 Depression       164  2.83
## 4       2 Non depression  5637 97.2 
## 5       3 Depression        69  3.90
## 6       3 Non depression  1702 96.1

다만 그래프를 그릴때 사용되는 것은 우울감의 유병율이므로 filter 를 통해 depression 변수의 변수값이 Depression일 때만을 사용하도록 하겠습니다.

a2 %>%
   group_by(oncall3) %>%
   count(depression) %>%
   mutate(prob = n/sum(n)*100) %>%
   filter(depression == 'Depression')

## # A tibble: 3 x 4
## # Groups:   oncall3 [3]
##   oncall3 depression     n  prob
##     <dbl> <fct>      <int> <dbl>
## 1       1 Depression   773  2.27
## 2       2 Depression   164  2.83
## 3       3 Depression    69  3.90

이제 oncall3의 1, 2, 3을 x-축에, prob(우울감 유병률)을 y-축에 위치시키고 그림을 그리겠습니다. 그림은 ggplot2 라이브러리를 이용하겠습니다. 이것이 가장 기본적인 그래프 그리기입니다. aes안에는 x, y축을 담당하고 데이터의 기본 가정에 해당하는 것을 적어 놓습니다. geom_* 뒤에는 표현 방식에 대한 부분을 넣습니다. geom_line은 선차트, geom_point 산점차트를 그린다는 것입니다.

a2 %>%
   group_by(oncall3) %>%
   count(depression) %>%
   mutate(prob = n/sum(n)*100) %>%
   filter(depression == 'Depression') %>%
   ggplot(aes(x = oncall3, y = prob)) +
   geom_point()+
   geom_line()

우리는 이제 데이터에 내포된 의미를 찾기 위해 고용형태에 따른 oncall과 우울간의 관계를 그려보고자 합니다. 따라서 group_by( )에 고용형태 변수를 넣어 같은 그림을 그려봅니다. 상기 그래프에서 color, shape, linetype을 고용형태에 따라 다르게 해달라고 표신한 부분을 확인해 주세요.

a2 %>%
   group_by(oncall3, Statusw) %>%
   count(depression) %>%
   mutate(prob = n/sum(n)*100) %>%
   filter(depression == 'Depression') %>%
   ggplot(aes(x = oncall3, y = prob)) +
   geom_point(aes(color = Statusw, shape = Statusw), size =3) +
   geom_line(aes(color = Statusw, linetype = Statusw))

이번에는 그래프를 둘로 나눌 것입니다. 하나는 45시간 이하 근무자, 다른 하나는 45시간 초과 근무자로 나눌 것입니다. 그래프는 facet_wrap()를 이용하고 그 안에 lwhf~.는 근무시간에 따라 나누어 그리라는 뜻입니다. color부분은 중복되니 맨 처음에 group = Statusw 를 color = Statusw로 바꾸겠습니다.

a2 %>% group_by(oncall3, Statusw, lwhf) %>%
  count(depression) %>%
  mutate(prob = n/sum(n)*100) %>%
  filter(depression == 'Depression') %>%
  ggplot(aes(x= oncall3, y = prob, color = Statusw)) +
  geom_point(aes(shape = Statusw), size =3) +
  geom_line(aes(linetype = Statusw)) +
  theme_minimal() +
  facet_wrap(lwhf~.)

x 축을 none, rarely, several times로 바꾸고, y축은 % 단위로 바꾸며, 두 그래프 사이를 2칸 띄우는 방식을 도입해 그래프를 마무리 하도록 하겠습니다.

a2 %>% group_by(oncall3, Statusw, lwhf) %>%
  count(depression) %>%
  mutate(prob = n/sum(n)*100) %>%
  filter(depression == 'Depression') %>%
  ggplot(aes(x= oncall3, y = prob, color = Statusw)) +
  geom_point(aes(shape = Statusw), size =3) +
  geom_line(aes(linetype = Statusw)) +
  theme_minimal() +
  facet_wrap(lwhf~.) +
  labs(color = 'Employment status',
       shape = 'Employment status',
       linetype = 'Employment status') +
  xlab("on call works") + ylab("Prevalance of Depressive symptoms") +
  scale_x_continuous(breaks = c(1, 2, 3), 
                     labels = c('1' = 'none', 
                                '2' = 'rarely',
                                '3' = 'several \ntimes')) +
  scale_y_continuous(labels = function(x) paste0(x, "%"))+
  theme(panel.spacing = unit(2, "lines"))

9.3 시각화 과제 1

이번에는 고객 대면 근로자가 화난 고객을 상대함에 있어서, 상사와 동료의 지지가 어떠한 보호 효과를 보이는지 그래프를 그려보도록 하겠습니다. 저번 실습자료에 이번 과제에서 사용될 변수인 감정노동, 동료의지지, 상사의지지를 추가하겠습니다.

변수	설명
Q26_7	서비스직, 대면 근로자
Q26_9	화난 고객 대응 근로자
Q49_15	일하면서 감정숨김
Q49_2	상사의 지지적 문화
Q49_1	동료의 지지적 문화

em <- a %>%
   select(Q35, AGE, TSEX, TEF1, Q22_1, Q05, Q06, EF11, EF12, Q69,Q62_1_8, Q35, # 기존 실습
           Q16_1, Q26_7, Q26_8, Q49_15, Q49_2, Q49_1, KQ50) %>%
   filter(Q16_1 >=1, Q26_7 <7, Q26_8 <=7, Q49_15 <=3,  
    KQ50<9, Q49_2 <7, Q49_1<7)  %>%               # 새로 추가된 실습 과제 변수
   mutate(agegp     = ifelse(AGE <30, '<30', 
                             ifelse(AGE <40, '30-49', 
                                    ifelse(AGE <50, '40-49', 
                                           ifelse(AGE < 60, '50-59', '≥60'))))) %>%
   mutate(Wh=cut(Q22_1, breaks=Wh_breaks, include.lowest=TRUE, right=FALSE, 
                        labels=Wh_labels)) %>% 
   mutate(Wh=structure(Wh, label='Working hours')) %>%
   mutate(Gender=factor(TSEX, levels=c(1,2), labels=c('Men', 'Women') )) %>%
   mutate(Education=factor(TEF1, levels=c(1,2,3,4), 
                        labels=(c('Primary', 'Middle', 'High', 'University')))) %>%
   mutate(Statusw=ifelse(Q05 %in% c(1,2), 'Self employer', 
                         ifelse(Q05 %in% c(3), 'Paid worker', 
                                'Family workers and others'))) %>%
   mutate(inc1=ifelse(EF11 <10000, EF11, ifelse(EF12<10, EF12*100-50, NA) )) %>%
   mutate(inc=cut(inc1, breaks=inc_break, labels=inc_label)) %>%
   mutate(job_st = factor(Q69, levels = c(1:4), 
                  labels = c("Very satisfied", "Satisfied", "Unsatisfied", "Very unsatisfied"))) %>%
   mutate(job_st = structure(job_st, label = 'Job Satisfaction')) %>%
   mutate(depression = Q62_1_8)%>%
   mutate(depression = structure(factor(depression, levels=c(1, 2), 
                                     labels=c("Depression", "Non depression")))) %>%
  mutate(servieworker=Q26_7) %>%
  
  mutate(angergp=ifelse(Q26_8 %in% c(1,2,3), 3, 
                        ifelse(Q26_8 %in% c(4, 5), 2, 1))) %>%
  mutate(angergp2=ifelse(Q26_8 %in% c(1,2,3), 'always', 
                         ifelse(Q26_8 %in% c(4, 5), 'sometimes', 'rarely'))) %>%
  mutate(angergp2 = factor (angergp2, levels = c('rarely', 'sometimes', 'always'))) %>%
  mutate(suppressiongp=ifelse(Q49_15 %in% c(1,2,3), 1, 0)) %>%
  mutate(suppressiongp2=ifelse(Q49_15 %in% c(1,2), 'always',
                               ifelse(Q49_15 %in% c(3), 'sometimes', 'rarely'))) %>%
  mutate(support_sup=ifelse(Q49_2 %in% c(1,2), 'always', 
                            ifelse(Q49_2 %in% c(3), 'sometimes', 'rarely')), 
         support_col=ifelse(Q49_1 %in% c(1,2), 'always', 
                            ifelse(Q49_1 %in% c(3),'sometimes', 'rarely')))%>%
  mutate(support_sup=factor(support_sup, levels=c('always', 'sometimes', 'rarely')), 
         support_col=factor(support_col, levels=c('always', 'sometimes', 'rarely'))) %>%
  mutate(support_sup2=ifelse(Q49_2 %in% c(1,2, 3), '> sometimes', 'rarely'), 
         support_col2=ifelse(Q49_1 %in% c(1,2, 3), '> sometimes', 'rarely')) %>%
  mutate(support_sup2=factor(support_sup2, levels=c('> sometimes', 'rarely')), 
         support_col2=factor(support_col2, levels=c('> sometimes', 'rarely')))

상기 자료를 통해 아래의 그래프를 그리시오

9.3.1 과제 1 그림 완성하기

여기에 group_by(…, …, …,)와 ggplot(aes(x=…, y=…, color =…)) 에 알맞은 변수를 넣어 위에 그래프를 완성하시오

em %>% group_by(..., ..., ...) %>%  ## 여기에 변수 넣기
  count(depression) %>%
  mutate(prob=n/sum(n)*100)  %>%
  filter(depression=='Depression') %>%
  ggplot(aes(x=..., y=..., color=...)) + ## 여기에 변수 넣기
  geom_point(aes(shape = support_sup),size=3) +
  geom_line(aes(linetype = support_sup))+
  scale_x_continuous(breaks=c(1, 2, 3), 
                     labels=c('1'='rarely', '2'='sometimes', '3'='always' )) +
  xlab('Engaging complain customers') +
  ylab('Prevalance of depressive symptoms') +
  labs(color    = 'Support from \nSupervisor', 
       shape    = 'Support from \nSupervisor', 
       linetype = 'Support from \nSupervisor') + 
  scale_y_continuous(labels = function(x) paste0(x, "%"))+
  theme_minimal()+
  facet_wrap(Gender~.)+
  theme(panel.spacing = unit(2, "lines"))

9.3.2 과제 2 동료의 지지가 미치는 영향을 그리시오

위와 동일한 가설에서 상상의 지지 대신 동료의 지지가 미치는 영향을 그리시오.

9.3.3 과제 3 자신의 데이터 실습

자신이 생각하는 종속변수와 관심변수 중 동료의 지지 처럼 종속변수에 영향을 미치는 변수를 찾아 그림을 그리세요.

9.4 상호 작용 영향 표현

화난 고객을 상대하는 비율이 높고, 상사의 지지가 낮은 경우가 동시에 발생할 때의 오즈비를 구해보았다. 아래의 코드에서 OR 값 (백터) 또는 ORmatrix 값 (matrix) 를 이용해서 자유롭게 그림을 그려보세요. 몇가지 예시 그림을 아래에 그려 놓겠습니다.

em2 <-em %>% mutate(combine = ifelse(support_sup2 == '> sometimes' & angergp2 == 'rarely', 1, 
                              ifelse(support_sup2 == '> sometimes' & angergp2 == 'sometimes', 2, 
                              ifelse(support_sup2 == '> sometimes' & angergp2 == 'always', 3, 
                              ifelse(support_sup2 == 'rarely' & angergp2 == 'rarely', 4, 
                              ifelse(support_sup2 == 'rarely' & angergp2 == 'sometimes', 5, 6))))))

intfit1 <- glm(data = em2, family = binomial, 
               depression =='Depression' ~ as.factor(combine)
)

exp_OR <- exp(coef(intfit1))
OR<-as.numeric(exp(coef(intfit1)))
OR[1] <-1
ORs <- OR
ORs

## [1] 1.000000 1.279070 2.034867 2.019262 4.785292 8.064103

첫번째 예시는, 3D barplot 으로 plot3D 페키지를 이용했습니다. 아무래도 이것은 액셀을 이용하는 것이 더 편할 것 같네요.

library(plot3D)

ORmatrix<-t(matrix(as.numeric(ORs), nrow =2, ncol=3, byrow = TRUE)  )
rownames(ORmatrix) <- c('always', 'sometimes', 'rarely')
colnames(ORmatrix) <- c('rarely', '>sometimes')

hist3D(x=c(1, 2, 3), y =1:2,z = ORmatrix, scale = FALSE, expand = 0.1,
       bty = "g", phi = 20, theta = -30,
       col = "#0072B2", border = "black", shade = 0.9, ltheta = 90,
       space = 0.15, ticktype = "detailed", d = 2, nticks =3,
       xlab="", ylab="", zlab = "Odds ratio", 
       cex.axis = 1e-9)
text3D(x= 1:3, y=rep(0.5, 3), z=rep(-1, 3), 
       labels = c('always', 'sometimes', 'rarely'), 
       add = TRUE, adj = 0)
text3D(x= 1.5, y=0, z=-1, 
       labels = c('Support from supervisor'), 
       add = TRUE, adj = 0)
text3D(x= c(0.1,0.1), y=c(1.2, 2), z=c(-0, -0), 
       labels = c('rarely', '>sometimes'), 
       add = TRUE, adj = 0)
text3D(x= c(-0.3), y=c(1), z=c(-0.0), 
       labels = c('Engaging \nComplain customers'), 
       add = TRUE, adj = 0)

text3D(x= rep(0.12,4), y=rep(2,4), z=c(2*1:4+2.1), 
       labels = c('2', '4', '6', '8'), 
       add = TRUE, adj = 0)

두번째 예시는 아래와 같습니다. 몇가지 수작업을 통해 선을 그려주는 명령어 및 축에 label을 수작업으로 넣는 방법을 보여주고 있습니다. 실제로 많이 사용되는 방법이므로 익숙해 지시기 바랍니다.

dat <- data.frame(x = 1:6, 
                  y = ORs)
dat %>%
   ggplot(aes(x=x, y =y)) +
   geom_point()+
   geom_line() +
   theme_minimal() +
   theme(axis.text.x=element_blank())+
   annotate(geom = 'text', x= 1:6+0.1, y =0.5, label = rep(c('rarely','sometimes', 'always'),2), size =3) +
   annotate(geom = 'text', x= c(2,5), y =0.0, label = rep(c('complain customers'), each = 2), size=4) +
   annotate(geom = 'text', x= c(2, 5), y =-0.7, label = rep(c('>sometimes', 'rarely'), each = 1), size = 5) +
   annotate(geom = 'text', x= c(3.5), y =-1.2, label = rep(c('Support from superior'), each = 1), size =6) +
   geom_hline(yintercept = 1, color ='black') +
   geom_hline(yintercept = -0.3, color ='grey')+
   geom_segment(aes(x = c(3.5), y = c(-0.3), xend = c(3.5), yend = c(8)), colour = 'grey', linetype=1) + 
   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
    panel.background = element_blank(), axis.line = element_blank()) +
   xlab("") +ylab("Odds Ratio")

9.5 라벨 자료

보건학에서 우선순위를 선정하는 경우를 상상해 보자. 여기에 특정 업종에서 특정 질병이 얼마나 발생하는지에 대한 가상의 자료가 있다. 여기에 두가지 산재발생률(특정업종 근로자에서 질병 발생률 = 질병발생수/특정업종 근로자수)과 산재 비례발생률(특정업종 근로자에서 전체 질병 대비 특정 질병 발생률)이 있습니다. 이것 자체의 의미는 보다는, 특정 업종 산재 발생률이 전체 업종 산재 발생률에 대해 얼마의 크기가 있는가와 특정 업정 특정 산재질병 비례발생률이 전체 업종에 비해 얼마의 크기가 있는지를 구하였습니다. 전자를 claim rate ratio (crr) 이라고 하고, 후자를 proportional claim rate ratio (pcr) 으로 명칭하겠습니다. 참고로 crr과 pcr의 장점은 crr의 경우 대표성이 있고 율의 개념이 들어간 비이므로 우리가 흔히 생각하는 위험도의 개념으로 설명이 가능합니다. 그러나 crr은 건강근로자효과에 매우 취약한데, 예를 들어 특정 집단이 손상으로 미리 사망하여 암이 발생하기 어렵다던가하면 암의 crr이 줄어들게 됩니다. 반대로 pcr은 율의 개념이 상이하여 우리가 아는 위험도의 의미로 해석하기는 어려우나, 위의 경우보다는 건강근로자효과에 보다 안정적일 경우가 있습니다. 예를 들어 각 업종별로 산재신청의 이유가 동일하다면 전체 산재신청중에 특정질병의 수이기 때문에 동일한 의미를 볼수 있습니다. 그러한 의미로 crr과 pcr이 모두 높은 경우를 우리는 좀더 관심을 갖어야 하는 질환으로 생각할 수 있습니다.

이번시간에는 상기 가상의 자료를 통해 업종별 질병별 보건학 우선순위를 그래프로 그려보겠습니다.

관련 동영상	링크
라벨자료	https://youtu.be/eE5FeVPfgsY

라이브러리 불러오기

library(tidyverse)
library(htmlTable)
library(ggplot2)
library(ggrepel)
library(DT)

9.5.1 라벨자료 그래프 실습

qfdat를 불러오고 여기서 crr고 pcr의 그래프를 그려보겠습니다.

qfdat <- readxl:: read_xlsx('data/qfdat1.xlsx')
dat <- qfdat %>% 
  mutate(zx = crr * pcr) %>%
  mutate(names = paste0(industry, "\nX", disease))
              


qfdat %>%
  ggplot(aes(x= crr, y = pcr)) +
  geom_point() +
  theme_classic()+
  xlab("claim rate ratio") +
  ylab("proportional claim rate ratio") #+   ylim(c(-0.5, 6))

x-, y- 축을 log로 치환하여 보도록 하겠습니다. 다만 로그로 하는 경우에는 중심 좌표가 혼란이 올 수 있습니다. 보통 1이 중심이 되는데 (비 이기 때문), log(1) =0이므로 x와 y에 log(1) 값에 대한 기준 선을 추가하겠습니다.

dat %>%
  ggplot(aes(x= log(crr), y = log(pcr))) +
  geom_point() +
  geom_vline(xintercept = c(log(1)), linetype =2, color ='grey') +
  geom_hline(yintercept = c(log(1)), linetype =2, color ='grey')+
  theme_classic()+
  xlab("claim rate ratio") +
  ylab("proportional claim rate ratio") #+   ylim(c(-0.5, 6))

여기에 라벨을 그려보도록 하겠습니다. 너무 많아서 보기가 매우 어렵습니다.

dat %>%
  ggplot(aes(x= log(crr), y = log(pcr))) +
  geom_point() +
  geom_vline(xintercept = c(log(1)), linetype =2, color ='grey') +
  geom_hline(yintercept = c(log(1)), linetype =2, color ='grey')+
  theme_classic()+
  xlab("claim rate ratio") +
  ylab("proportional claim rate ratio") + #+   ylim(c(-0.5, 6)) 
  geom_label_repel(aes(label = names))

pcr이 2가 넘고, crr이 2가 넘는 질환에 대해서만 그려 보도록 하겠습니다. 따라서 기준선도 좀 옮겨 보겠습니다.

dat %>%
  filter(crr >3 & pcr >3) %>%
  ggplot(aes(x= log(crr), y = log(pcr))) +
  geom_point() +
  geom_vline(xintercept = c(log(6)), linetype =2, color ='grey') +
  geom_hline(yintercept = c(log(6)), linetype =2, color ='grey')+
  theme_classic()+
  xlab("claim rate ratio") +
  ylab("proportional claim rate ratio") + #+   ylim(c(-0.5, 6)) 
  geom_label_repel(aes(label = names))

그림과 선이 잘 보이도록, size를 고려하고, 축 범위를 조절하겠습니다.

dat %>%
  filter(crr >3 & pcr >3) %>%
  ggplot(aes(x= log(crr), y = log(pcr))) +
  geom_point() +
  geom_vline(xintercept = c(log(6)), linetype =2, color ='grey') +
  geom_hline(yintercept = c(log(6)), linetype =2, color ='grey')+
  theme_classic()+
  xlab("claim rate ratio") +
  ylab("proportional claim rate ratio") + 
  xlim(c(-1, 7)) + ylim(c(-1, 6)) +
    geom_label_repel(aes(label = names), 
                   fill = NA, # 투명하게 해서 겹쳐도 보이게
                   alpha =1, size = 3, # 작게
                   box.padding = 0.4,  # 분별해서
                   segment.size =0.1,  # 선 
                   force = 2)          # 이것은 무엇일까요?

마지막으로 업종의 크기나 질병의 크기를 표현해 보겠습니다. 그 이유는 아무래도 규모가 큰 경우에 사업을 하는 경우 더 큰 효과를 볼 수 있기때문입니다.

dat %>%
  filter(crr >3 & pcr >3) %>%
  ggplot(aes(x= log(crr), y = log(pcr), 
             color = as.factor(round(sc_ind/3,1)))) + # ind(업종) 규모에 따른 색
   geom_point(aes( size = sc_dz^2), alpha = 0.5) + # dz(질병) 규모에 따른 크기
  geom_vline(xintercept = c(log(6)), linetype =2, color ='grey') +
  geom_hline(yintercept = c(log(6)), linetype =2, color ='grey')+
  theme_classic()+
  xlab("claim rate ratio") +
  ylab("proportional claim rate ratio") + 
  xlim(c(-1, 7)) + ylim(c(-1, 6)) +
    geom_label_repel(aes(label = names), 
                   fill = NA, # 투명하게 해서 겹쳐도 보이게
                   alpha =1, size = 3, # 작게
                   box.padding = 0.4,  # 분별해서
                   segment.size =0.1,  # 선 
                   force = 2)      +    # 이것은 무엇일까요?
  theme(legend.position="bottom")+
  labs(color = "Factory size", size = 'Disease Score')