# 第 83 章 探索性数据分析-大学生职业决策

## 83.1 预备知识

library(tidyverse)

example <-
tibble::tribble(
~name, ~english, ~chinese, ~math, ~sport, ~psy, ~edu,
"A",     133,    100,    102,     56,    89,   89,
"B",     120,    120,     86,     88,    45,   75,
"C",      98,    109,    114,     87,    NA,   84,
"D",     120,     78,    106,     68,    86,   69,
"E",     110,     99,    134,     98,    75,   70,
"F",      NA,    132,    130,     NA,    68,   88
)

example
## # A tibble: 6 × 7
##   name  english chinese  math sport   psy   edu
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A         133     100   102    56    89    89
## 2 B         120     120    86    88    45    75
## 3 C          98     109   114    87    NA    84
## 4 D         120      78   106    68    86    69
## 5 E         110      99   134    98    75    70
## 6 F          NA     132   130    NA    68    88

### 83.1.1 缺失值检查

example %>%
summarise(
na_in_english = sum(is.na(english)),
na_in_chinese = sum(is.na(chinese)),
na_in_math    = sum(is.na(math)),
na_in_sport   = sum(is.na(sport)),
na_in_psy     = sum(is.na(math)),   # tpyo here
na_in_edu     = sum(is.na(edu))
)
## # A tibble: 1 × 6
##   na_in_english na_in_chinese na_in_math na_in_sport na_in_psy na_in_edu
##           <int>         <int>      <int>       <int>     <int>     <int>
## 1             1             0          0           1         0         0

example %>%
summarise(
across(everything(), mean)
)
## Warning: There was 1 warning in summarise().
## ℹ In argument: across(everything(), mean).
## Caused by warning in mean.default():
## ! argument is not numeric or logical: returning NA
## # A tibble: 1 × 7
##    name english chinese  math sport   psy   edu
##   <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1    NA      NA    106.   112    NA    NA  79.2
example %>%
summarise(
across(everything(), function(x) sum(is.na(x)) )
)
## # A tibble: 1 × 7
##    name english chinese  math sport   psy   edu
##   <int>   <int>   <int> <int> <int> <int> <int>
## 1     0       1       0     0     1     1     0

### 83.1.2 数据预处理

• 直接丢弃缺失值所在的行
example %>% drop_na()
## # A tibble: 4 × 7
##   name  english chinese  math sport   psy   edu
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A         133     100   102    56    89    89
## 2 B         120     120    86    88    45    75
## 3 D         120      78   106    68    86    69
## 4 E         110      99   134    98    75    70
• 均值代替缺失值
d <- example %>%
mutate(
across(where(is.numeric), ~ if_else(is.na(.), mean(., na.rm = T), .))
)
d
## # A tibble: 6 × 7
##   name  english chinese  math sport   psy   edu
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A        133      100   102  56    89      89
## 2 B        120      120    86  88    45      75
## 3 C         98      109   114  87    72.6    84
## 4 D        120       78   106  68    86      69
## 5 E        110       99   134  98    75      70
## 6 F        116.     132   130  79.4  68      88
• 计算总分/均值
d %>%
rowwise() %>%
mutate(
total = sum(c_across(-name))
)
## # A tibble: 6 × 8
## # Rowwise:
##   name  english chinese  math sport   psy   edu total
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A        133      100   102  56    89      89  569
## 2 B        120      120    86  88    45      75  534
## 3 C         98      109   114  87    72.6    84  565.
## 4 D        120       78   106  68    86      69  527
## 5 E        110       99   134  98    75      70  586
## 6 F        116.     132   130  79.4  68      88  614.
d %>%
rowwise() %>%
mutate(
mean = mean(c_across(-name))
)
## # A tibble: 6 × 8
## # Rowwise:
##   name  english chinese  math sport   psy   edu  mean
##   <chr>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A        133      100   102  56    89      89  94.8
## 2 B        120      120    86  88    45      75  89
## 3 C         98      109   114  87    72.6    84  94.1
## 4 D        120       78   106  68    86      69  87.8
## 5 E        110       99   134  98    75      70  97.7
## 6 F        116.     132   130  79.4  68      88 102.
• 数据标准化处理
standard <- function(x) {
(x - mean(x)) / sd(x)
}
d %>%
mutate(
across(where(is.numeric), standard)
)
## # A tibble: 6 × 7
##   name  english chinese   math  sport    psy    edu
##   <chr>   <dbl>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
## 1 A       1.44   -0.339 -0.555 -1.54   1.04   1.10
## 2 B       0.326   0.731 -1.44   0.566 -1.75  -0.464
## 3 C      -1.56    0.143  0.111  0.500  0      0.538
## 4 D       0.326  -1.51  -0.333 -0.750  0.852 -1.13
## 5 E      -0.531  -0.392  1.22   1.22   0.153 -1.02
## 6 F       0       1.37   0.999  0     -0.292  0.984

## 83.2 开始

### 83.2.1 文件管理中需要注意的地方

data <- readxl::read_excel("demo_data/career-decision.xlsx", skip = 1) %>%
janitor::clean_names()

#glimpse(data)
d <- data %>% select(1:61)
#glimpse(d)

### 83.2.2 缺失值检查

d %>%
summarise(
across(everything(), ~sum(is.na(.)))
)
## # A tibble: 1 × 61
##     sex majoy grade  from    z1    z2    z3    z4    z5    z6    z7    z8    z9
##   <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1     0     0     0     0     0     0     0     0     0     0     0     0     0
## # ℹ 48 more variables: z10 <int>, z11 <int>, z12 <int>, z13 <int>, z14 <int>,
## #   z15 <int>, z16 <int>, z17 <int>, z18 <int>, j1 <int>, j2 <int>, j3 <int>,
## #   j4 <int>, j5 <int>, j6 <int>, j7 <int>, j8 <int>, j9 <int>, j10 <int>,
## #   j11 <int>, j12 <int>, j13 <int>, j14 <int>, j15 <int>, j16 <int>,
## #   j17 <int>, j18 <int>, j19 <int>, j20 <int>, j21 <int>, j22 <int>,
## #   j23 <int>, j24 <int>, j25 <int>, j26 <int>, j27 <int>, j28 <int>,
## #   j29 <int>, j30 <int>, j31 <int>, j32 <int>, j33 <int>, j34 <int>, …

### 83.2.3 数据预处理

d <- d %>%
rowwise() %>%
mutate(
environment_exploration          = sum(c_across(z1:z5)),
self_exploration                 = sum(c_across(z6:z9)),
objective_system_exploration     = sum(c_across(z10:z15)),
info_quantity_exploration        = sum(c_across(z16:z18)),

self_evaluation                  = sum(c_across(j1:j6)),
information_collection           = sum(c_across(j7:j15)),
target_select                    = sum(c_across(j16:j24)),
formulate                        = sum(c_across(j25:j32)),
problem_solving                  = sum(c_across(j33:j39)),

career_exploration               = sum(c_across(z1:z18)),
career_decision_making           = sum(c_across(j1:j39))
) %>%
select(-starts_with("z"), -starts_with("j")) %>%
ungroup() %>%
mutate(pid = 1:n(), .before = sex) %>%
mutate(
across(c(pid, sex, majoy, grade, from), as_factor)
)

#glimpse(d)

### 83.2.4 标准化

standard <- function(x) {
(x - mean(x)) / sd(x)
}

d <- d %>%
mutate(
across(where(is.numeric), standard)
)
d
## # A tibble: 304 × 16
##    pid   sex   majoy grade from  environment_exploration self_exploration
##    <fct> <fct> <fct> <fct> <fct>                   <dbl>            <dbl>
##  1 1     1     4     4     2                     -1.63             -1.58
##  2 2     1     4     4     1                     -1.87             -0.723
##  3 3     2     4     4     2                      0.0802            0.132
##  4 4     2     4     4     1                     -1.87             -1.15
##  5 5     2     4     4     1                     -0.895            -1.58
##  6 6     1     1     4     3                     -0.651             1.41
##  7 7     1     4     4     3                     -2.36             -0.723
##  8 8     1     4     4     1                     -0.407            -1.15
##  9 9     1     4     4     3                     -0.651             0.132
## 10 10    1     4     4     2                      0.324             0.132
## # ℹ 294 more rows
## # ℹ 9 more variables: objective_system_exploration <dbl>,
## #   info_quantity_exploration <dbl>, self_evaluation <dbl>,
## #   information_collection <dbl>, target_select <dbl>, formulate <dbl>,
## #   problem_solving <dbl>, career_exploration <dbl>,
## #   career_decision_making <dbl>

## 83.3 探索

### 83.3.1 想探索的问题

• 不同性别（或者年级，生源地，专业）下，各指标分值的差异性
• 两个变量的相关分析和回归分析
• 更多（欢迎大家提出了喔）

### 83.3.2 男生女生在职业探索上有所不同？

d %>%
group_by(sex) %>%
summarise(
across(where(is.numeric), mean)
)
## # A tibble: 2 × 12
##   sex   environment_exploration self_exploration objective_system_exploration
##   <fct>                   <dbl>            <dbl>                        <dbl>
## 1 1                      -0.147          -0.0829                       -0.180
## 2 2                       0.165           0.0933                        0.202
## # ℹ 8 more variables: info_quantity_exploration <dbl>, self_evaluation <dbl>,
## #   information_collection <dbl>, target_select <dbl>, formulate <dbl>,
## #   problem_solving <dbl>, career_exploration <dbl>,
## #   career_decision_making <dbl>

library(ggridges)
## Warning: package 'ggridges' was built under R version 4.2.2
d %>%
ggplot(aes(x = career_exploration, y = sex, fill = sex)) +
geom_density_ridges()
## Picking joint bandwidth of 0.284
t_test_eq <- t.test(career_exploration ~ sex, data = d, var.equal = TRUE) %>%
broom::tidy()
t_test_eq
## # A tibble: 1 × 10
##   estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
##      <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl>
## 1   -0.367    -0.173     0.194     -3.24 0.00132       302   -0.589    -0.144
## # ℹ 2 more variables: method <chr>, alternative <chr>
t_test_uneq <- t.test(career_exploration ~ sex, data = d, var.equal = FALSE) %>%
broom::tidy()
t_test_uneq 
## # A tibble: 1 × 10
##   estimate estimate1 estimate2 statistic p.value parameter conf.low conf.high
##      <dbl>     <dbl>     <dbl>     <dbl>   <dbl>     <dbl>    <dbl>     <dbl>
## 1   -0.367    -0.173     0.194     -3.27 0.00121      302.   -0.588    -0.146
## # ℹ 2 more variables: method <chr>, alternative <chr>

library(infer)
## Warning: package 'infer' was built under R version 4.2.2
obs_diff <- d %>%
specify(formula = career_exploration ~ sex) %>%
calculate("diff in means", order = c("1", "2"))
obs_diff
## Response: career_exploration (numeric)
## Explanatory: sex (factor)
## # A tibble: 1 × 1
##     stat
##    <dbl>
## 1 -0.367
null_dist <- d %>%
specify(formula = career_exploration ~ sex) %>%
hypothesize(null = "independence") %>%
generate(reps = 5000, type = "permute") %>%
calculate(stat = "diff in means", order = c("1", "2"))
null_dist
## Response: career_exploration (numeric)
## Explanatory: sex (factor)
## Null Hypothesis: independence
## # A tibble: 5,000 × 2
##    replicate      stat
##        <int>     <dbl>
##  1         1  0.0461
##  2         2 -0.154
##  3         3 -0.0285
##  4         4 -0.0126
##  5         5 -0.0675
##  6         6 -0.0614
##  7         7  0.000860
##  8         8 -0.00280
##  9         9  0.0619
## 10        10 -0.0761
## # ℹ 4,990 more rows
null_dist %>%
visualize() +
shade_p_value(obs_stat = obs_diff, direction = "two_sided")
null_dist %>%
get_p_value(obs_stat = obs_diff, direction = "two_sided") %>%
#get_p_value(obs_stat = obs_diff, direction = "less") %>%
mutate(p_value_clean = scales::pvalue(p_value))
## # A tibble: 1 × 2
##   p_value p_value_clean
##     <dbl> <chr>
## 1  0.0016 0.002

d %>%
pivot_longer(
cols = -c(pid, sex, majoy, grade, from),
names_to = "index",
values_to = "value"
) %>%
group_by(index) %>%
summarise(
broom::tidy( t.test(value ~ sex, data = cur_data()))
) %>%
select(index, estimate, statistic, p.value) %>%
arrange(p.value)
## Warning: There was 1 warning in summarise().
## ℹ In argument: broom::tidy(t.test(value ~ sex, data = cur_data())).
## ℹ In group 1: index = "career_decision_making".
## Caused by warning:
## ! cur_data() was deprecated in dplyr 1.1.0.
## ℹ Please use pick() instead.
## # A tibble: 11 × 4
##    index                        estimate statistic    p.value
##    <chr>                           <dbl>     <dbl>      <dbl>
##  1 career_decision_making         -0.494     -4.53 0.00000862
##  2 problem_solving                -0.470     -4.26 0.0000270
##  3 target_select                  -0.449     -4.07 0.0000609
##  4 formulate                      -0.411     -3.72 0.000235
##  5 information_collection         -0.411     -3.70 0.000253
##  6 self_evaluation                -0.404     -3.65 0.000315
##  7 objective_system_exploration   -0.382     -3.40 0.000765
##  8 career_exploration             -0.367     -3.27 0.00121
##  9 environment_exploration        -0.312     -2.75 0.00629
## 10 info_quantity_exploration      -0.274     -2.42 0.0162
## 11 self_exploration               -0.176     -1.54 0.126

### 83.3.3 来自不同地方的学生在职业探索上有所不同？

aov(career_exploration ~ from, data = d) %>%
TukeyHSD(which = "from") %>%
broom::tidy()
## # A tibble: 3 × 7
##   term  contrast null.value estimate conf.low conf.high adj.p.value
##   <chr> <chr>         <dbl>    <dbl>    <dbl>     <dbl>       <dbl>
## 1 from  2-1               0   0.382    0.0623     0.701      0.0144
## 2 from  3-1               0   0.287   -0.0386     0.613      0.0964
## 3 from  3-2               0  -0.0943  -0.446      0.257      0.802
library(ggridges)
d %>%
ggplot(aes(x = career_exploration, y = from, fill = from)) +
geom_density_ridges()
## Picking joint bandwidth of 0.312

d %>%
pivot_longer(
cols = -c(pid, sex, majoy, grade, from),
names_to = "index",
values_to = "value"
) %>%
group_by(index) %>%
summarise(
broom::tidy( aov(value ~ from, data = cur_data()))
) %>%
select(index, term, statistic, p.value) %>%
filter(term != "Residuals") %>%
arrange(p.value)
## Warning: Returning more (or less) than 1 row per summarise() group was deprecated in
## dplyr 1.1.0.
## ℹ Please use reframe() instead.
## ℹ When switching from summarise() to reframe(), remember that reframe()
##   always returns an ungrouped data frame and adjust accordingly.
## Call lifecycle::last_lifecycle_warnings() to see where this warning was
## generated.
## summarise() has grouped output by 'index'. You can override using the
## .groups argument.
## # A tibble: 11 × 4
## # Groups:   index [11]
##    index                        term  statistic     p.value
##    <chr>                        <chr>     <dbl>       <dbl>
##  1 problem_solving              from     14.6   0.000000918
##  2 career_decision_making       from     14.2   0.00000126
##  3 formulate                    from     12.2   0.00000781
##  4 information_collection       from     10.2   0.0000527
##  5 self_evaluation              from      8.91  0.000174
##  6 target_select                from      8.45  0.000270
##  7 info_quantity_exploration    from      5.78  0.00344
##  8 career_exploration           from      4.48  0.0121
##  9 objective_system_exploration from      4.06  0.0181
## 10 environment_exploration      from      3.69  0.0260
## 11 self_exploration             from      0.699 0.498

### 83.3.4 职业探索和决策之间有关联？

lm(career_decision_making  ~ career_exploration, data = d)
##
## Call:
## lm(formula = career_decision_making ~ career_exploration, data = d)
##
## Coefficients:
##        (Intercept)  career_exploration
##          2.146e-15           7.831e-01