选取变量
cfps2010family %>%
select(urban, starts_with("fk201_a")) %>%
glimpse()
## Observations: 14,798
## Variables: 7
## $ urban <dbl+lbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ fk201_a_1 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_2 <dbl+lbl> -8.0, -8.0, -8.0, -8.0, 2.5,...
## $ fk201_a_3 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_4 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_5 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
## $ fk201_a_6 <dbl+lbl> -8, -8, -8, -8, -8, -8, -8, ...
获取标签
library(purrr)
get_var_label <- function(dta) {
labels <- map(dta, function(x) attr(x, "label"))
data_frame(
name = names(labels),
label = as.character(labels)
)
}
cfps2010family %>%
select(urban, starts_with("fk201_a")) %>%
get_var_label()
## # A tibble: 7 x 2
## name label
## <chr> <chr>
## 1 urban 基于国家统计局资料的城乡分类变量
## 2 fk201_a_1 您家拥有多少亩水田
## 3 fk201_a_2 您家拥有多少亩旱地
## 4 fk201_a_3 您家拥有多少亩林地
## 5 fk201_a_4 您家拥有多少亩果园
## 6 fk201_a_5 您家拥有多少亩草场
## 7 fk201_a_6 您家拥有多少亩池塘
数据统计
cfps2010family %>%
select(urban, starts_with("fk201_a")) %>%
map(~ count(data.frame(x = .x), x))
## $urban
## # A tibble: 2 x 2
## x n
## <dbl+lbl> <int>
## 1 0 7694
## 2 1 7104
##
## $fk201_a_1
## # A tibble: 114 x 2
## x n
## <dbl+lbl> <int>
## 1 -8.0 11657
## 2 -1.0 3
## 3 0.0 20
## 4 0.1 6
## 5 0.2 14
## 6 0.3 36
## 7 0.4 36
## 8 0.5 74
## 9 0.6 54
## 10 0.7 49
## # ... with 104 more rows
##
## $fk201_a_2
## # A tibble: 198 x 2
## x n
## <dbl+lbl> <int>
## 1 -8.0 8553
## 2 -1.0 12
## 3 0.0 42
## 4 0.1 31
## 5 0.2 67
## 6 0.3 74
## 7 0.4 39
## 8 0.5 178
## 9 0.6 64
## 10 0.7 50
## # ... with 188 more rows
##
## $fk201_a_3
## # A tibble: 115 x 2
## x n
## <dbl+lbl> <int>
## 1 -8.0 13700
## 2 -1.0 5
## 3 0.0 13
## 4 0.1 10
## 5 0.2 10
## 6 0.3 17
## 7 0.4 6
## 8 0.5 30
## 9 0.6 6
## 10 0.7 6
## # ... with 105 more rows
##
## $fk201_a_4
## # A tibble: 59 x 2
## x n
## <dbl+lbl> <int>
## 1 -8.0 14152
## 2 -1.0 6
## 3 0.0 19
## 4 0.1 6
## 5 0.2 9
## 6 0.3 10
## 7 0.4 4
## 8 0.5 33
## 9 0.6 2
## 10 0.7 4
## # ... with 49 more rows
##
## $fk201_a_5
## # A tibble: 5 x 2
## x n
## <dbl+lbl> <int>
## 1 -8.0 14790
## 2 1.0 4
## 3 2.0 2
## 4 3.0 1
## 5 3.3 1
##
## $fk201_a_6
## # A tibble: 36 x 2
## x n
## <dbl+lbl> <int>
## 1 -8.0 14707
## 2 0.0 10
## 3 0.1 1
## 4 0.2 3
## 5 0.3 3
## 6 0.4 1
## 7 0.5 7
## 8 0.6 1
## 9 0.7 1
## 10 0.8 2
## # ... with 26 more rows
# map_if(is.character, ~count(data.frame(x = .x), x) )
缺失值
library(naniar)
cfps2010family %>%
select(urban, starts_with("fk201_a")) %>%
miss_var_summary()
## # A tibble: 7 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 urban 0 0
## 2 fk201_a_1 0 0
## 3 fk201_a_2 0 0
## 4 fk201_a_3 0 0
## 5 fk201_a_4 0 0
## 6 fk201_a_5 0 0
## 7 fk201_a_6 0 0
library(visdat)
cfps2010family %>%
select(urban, starts_with("fk201_a")) %>%
vis_dat()
数据规整
cfps2010family %>%
select(urban, starts_with("fk2_s")) %>%
filter(urban == "0")
## # A tibble: 7,694 x 6
## urban fk2_s_1 fk2_s_2 fk2_s_3 fk2_s_4 fk2_s_5
## <dbl+l> <dbl+lb> <dbl+lb> <dbl+lb> <dbl+lb> <dbl+lb>
## 1 0 4 -8 -8 -8 -8
## 2 0 2 -8 -8 -8 -8
## 3 0 4 -8 -8 -8 -8
## 4 0 4 -8 -8 -8 -8
## 5 0 -8 -8 -8 -8 -8
## 6 0 -8 -8 -8 -8 -8
## 7 0 -8 -8 -8 -8 -8
## 8 0 6 -8 -8 -8 -8
## 9 0 4 -8 -8 -8 -8
## 10 0 -8 -8 -8 -8 -8
## # ... with 7,684 more rows
# filter_if,filter_at,filter_all
# para 1:tbl, para 2:column, para 3:row
a <- cfps2010family %>%
select(urban, starts_with("fk201_a")) %>%
filter_at(vars(starts_with("fk201_a")), any_vars(. > 0))
a
## # A tibble: 7,688 x 7
## urban fk201_a_1 fk201_a_2 fk201_a_3 fk201_a_4
## <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 1 -8 2.5 -8 -8.0
## 2 1 -8 -8.0 -8 0.9
## 3 0 -8 -8.0 -8 5.1
## 4 0 -8 1.8 -8 -8.0
## 5 0 -8 -8.0 -8 6.0
## 6 0 -8 -8.0 -8 -8.0
## 7 0 -8 -8.0 -8 4.0
## 8 0 -8 -8.0 -8 1.5
## 9 0 -8 -8.0 -8 3.0
## 10 0 -8 1.5 -8 -8.0
## # ... with 7,678 more rows, and 2 more variables:
## # fk201_a_5 <dbl+lbl>, fk201_a_6 <dbl+lbl>
a %>% mutate_at(vars(starts_with("fk201_a")), funs(replace(., . < 0, 0)))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## please use list() instead
##
## # Before:
## funs(name = f(.)
##
## # After:
## list(name = ~f(.))
## This warning is displayed once per session.
## # A tibble: 7,688 x 7
## urban fk201_a_1 fk201_a_2 fk201_a_3 fk201_a_4
## <dbl> <dbl+lbl> <dbl+lbl> <dbl+lbl> <dbl+lbl>
## 1 1 0 2.5 0 0.0
## 2 1 0 0.0 0 0.9
## 3 0 0 0.0 0 5.1
## 4 0 0 1.8 0 0.0
## 5 0 0 0.0 0 6.0
## 6 0 0 0.0 0 0.0
## 7 0 0 0.0 0 4.0
## 8 0 0 0.0 0 1.5
## 9 0 0 0.0 0 3.0
## 10 0 0 1.5 0 0.0
## # ... with 7,678 more rows, and 2 more variables:
## # fk201_a_5 <dbl+lbl>, fk201_a_6 <dbl+lbl>