4 数据预处理(dplyr)
4.1 导入数据
# 用 readr 包把数据读取成 tbl 格式,省去转化的那一步。
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(readr)
order <- read_csv('~/Nutstore/R/dplyr/dplyr-data/order.csv')[-1] # 去除空名的第一列
#> Warning: Missing column names filled in: 'X1' [1]
#> Parsed with column specification:
#> cols(
#> X1 = col_integer(),
#> orderid = col_integer(),
#> customerid = col_integer(),
#> campaignid = col_integer(),
#> orderdate = col_date(format = ""),
#> city = col_character(),
#> state = col_character(),
#> zipcode = col_character(),
#> paymenttype = col_character(),
#> totalprice = col_double(),
#> numorderlines = col_integer(),
#> numunits = col_integer()
#> )
order
#> # A tibble: 100,000 x 11
#> orderid customerid campaignid orderdate city state zipcode
#> <int> <int> <int> <date> <chr> <chr> <chr>
#> 1 1002854 45978 2141 2009-10-13 NEWTON MA 02459
#> 2 1002855 125381 2173 2009-10-13 NEW ROCHELLE NY 10804
#> 3 1002856 103122 2141 2011-06-02 MIAMI FL 33137
#> 4 1002857 130980 2173 2009-10-14 E RUTHERFORD NJ 07073
#> 5 1002886 48553 2141 2010-11-19 BALTIMORE MD 21218
#> 6 1002887 106150 2173 2009-10-15 ROWAYTON CT 06853
#> # ... with 9.999e+04 more rows, and 4 more variables: paymenttype <chr>,
#> # totalprice <dbl>, numorderlines <int>, numunits <int>
4.2 数据筛选 filter,返回满足条件的观测值
# 看 2009-10-15 日的数据
filter(order, orderdate == '2009-10-13')
#> # A tibble: 19 x 11
#> orderid customerid campaignid orderdate city state zipcode
#> <int> <int> <int> <date> <chr> <chr> <chr>
#> 1 1002854 45978 2141 2009-10-13 NEWTON MA 02459
#> 2 1002855 125381 2173 2009-10-13 NEW ROCHELLE NY 10804
#> 3 1002442 112970 2173 2009-10-13 BRANFORD CT 06405
#> 4 1002471 125377 2173 2009-10-13 VALLEY FORGE PA 19482
#> 5 1002472 133164 2173 2009-10-13 FORT WORTH TX 76107
#> 6 1003224 129708 2173 2009-10-13 CERRO GORDO NC 28430
#> # ... with 13 more rows, and 4 more variables: paymenttype <chr>,
#> # totalprice <dbl>, numorderlines <int>, numunits <int>
# 更细的筛选
filter(order, orderdate == '2009-10-13' & totalprice > 100) -> new_tbl
new_tbl
#> # A tibble: 1 x 11
#> orderid customerid campaignid orderdate city state zipcode
#> <int> <int> <int> <date> <chr> <chr> <chr>
#> 1 1002854 45978 2141 2009-10-13 NEWTON MA 02459
#> # ... with 4 more variables: paymenttype <chr>, totalprice <dbl>,
#> # numorderlines <int>, numunits <int>
4.3 子集选取 select,只保留选择的变量
# 选取几个子集
names(order)
#> [1] "orderid" "customerid" "campaignid" "orderdate"
#> [5] "city" "state" "zipcode" "paymenttype"
#> [9] "totalprice" "numorderlines" "numunits"
select(order, city, numunits, zipcode)
#> # A tibble: 100,000 x 3
#> city numunits zipcode
#> <chr> <int> <chr>
#> 1 NEWTON 3 02459
#> 2 NEW ROCHELLE 1 10804
#> 3 MIAMI 2 33137
#> 4 E RUTHERFORD 1 07073
#> 5 BALTIMORE 1 21218
#> 6 ROWAYTON 1 06853
#> # ... with 9.999e+04 more rows
# 换名, 提取选择的数据列
select(order, date = orderdate, price = totalprice)
#> # A tibble: 100,000 x 2
#> date price
#> <date> <dbl>
#> 1 2009-10-13 190.0
#> 2 2009-10-13 10.0
#> 3 2011-06-02 35.2
#> 4 2009-10-14 10.0
#> 5 2010-11-19 10.0
#> 6 2009-10-15 10.0
#> # ... with 9.999e+04 more rows
names(order)
#> [1] "orderid" "customerid" "campaignid" "orderdate"
#> [5] "city" "state" "zipcode" "paymenttype"
#> [9] "totalprice" "numorderlines" "numunits"
# rename,换名,提取所有列
rename(order, date = orderdate, price = totalprice)
#> # A tibble: 100,000 x 11
#> orderid customerid campaignid date city state zipcode
#> <int> <int> <int> <date> <chr> <chr> <chr>
#> 1 1002854 45978 2141 2009-10-13 NEWTON MA 02459
#> 2 1002855 125381 2173 2009-10-13 NEW ROCHELLE NY 10804
#> 3 1002856 103122 2141 2011-06-02 MIAMI FL 33137
#> 4 1002857 130980 2173 2009-10-14 E RUTHERFORD NJ 07073
#> 5 1002886 48553 2141 2010-11-19 BALTIMORE MD 21218
#> 6 1002887 106150 2173 2009-10-15 ROWAYTON CT 06853
#> # ... with 9.999e+04 more rows, and 4 more variables: paymenttype <chr>,
#> # price <dbl>, numorderlines <int>, numunits <int>
names(order)
#> [1] "orderid" "customerid" "campaignid" "orderdate"
#> [5] "city" "state" "zipcode" "paymenttype"
#> [9] "totalprice" "numorderlines" "numunits"
# 选取以 order 开始的变量
select(order, starts_with('order'))
#> # A tibble: 100,000 x 2
#> orderid orderdate
#> <int> <date>
#> 1 1002854 2009-10-13
#> 2 1002855 2009-10-13
#> 3 1002856 2011-06-02
#> 4 1002857 2009-10-14
#> 5 1002886 2010-11-19
#> 6 1002887 2009-10-15
#> # ... with 9.999e+04 more rows
# 选取包含id 的变量
select(order, contains('id'))
#> # A tibble: 100,000 x 3
#> orderid customerid campaignid
#> <int> <int> <int>
#> 1 1002854 45978 2141
#> 2 1002855 125381 2173
#> 3 1002856 103122 2141
#> 4 1002857 130980 2173
#> 5 1002886 48553 2141
#> 6 1002887 106150 2173
#> # ... with 9.999e+04 more rows
4.4 数据排序 arrange
tbl <- select(order, date = orderdate, price = totalprice)
arrange(tbl, date, desc(price))
#> # A tibble: 100,000 x 2
#> date price
#> <date> <dbl>
#> 1 2009-10-04 200
#> 2 2009-10-04 120
#> 3 2009-10-04 100
#> 4 2009-10-04 100
#> 5 2009-10-04 70
#> 6 2009-10-04 50
#> # ... with 9.999e+04 more rows
# 管道操作,更简便
tbl <- select(order, date = orderdate, price = totalprice) %>%
arrange(date, desc(price))
tbl
#> # A tibble: 100,000 x 2
#> date price
#> <date> <dbl>
#> 1 2009-10-04 200
#> 2 2009-10-04 120
#> 3 2009-10-04 100
#> 4 2009-10-04 100
#> 5 2009-10-04 70
#> 6 2009-10-04 50
#> # ... with 9.999e+04 more rows
4.5 数据扩展 mutate
tbl <- select(order, date = orderdate, price = totalprice) %>%
arrange(date,desc(price)) %>%
mutate(year = substr(date,1,4), month = substr(date,6,7),
day = substr(date,9,10))
tbl
#> # A tibble: 100,000 x 5
#> date price year month day
#> <date> <dbl> <chr> <chr> <chr>
#> 1 2009-10-04 200 2009 10 04
#> 2 2009-10-04 120 2009 10 04
#> 3 2009-10-04 100 2009 10 04
#> 4 2009-10-04 100 2009 10 04
#> 5 2009-10-04 70 2009 10 04
#> 6 2009-10-04 50 2009 10 04
#> # ... with 9.999e+04 more rows
# transmute 则会删除原有的变量,如同 select 与 rename
4.6 数据汇总 summarise
summarise(tbl, max = max(price),
min = min(price),
mean(price))
#> # A tibble: 1 x 3
#> max min mean(price)
#> <dbl> <dbl> <dbl>
#> 1 6780 0 60.8
summarise(tbl, fisrt = first(date),
last = last(date))
#> # A tibble: 1 x 2
#> fisrt last
#> <date> <date>
#> 1 2009-10-04 2014-05-29
# 管道操作
df <- select(order, date = orderdate, price = totalprice) %>%
summarise(price_sum = sum(price), price_mean = mean(price))
df
#> # A tibble: 1 x 2
#> price_sum price_mean
#> <dbl> <dbl>
#> 1 6077773 60.8
4.7 数据连接 join
- left_join 以左边为主,且保留所有值,用的最多,b 表追加在后面
- inner_join 只保留匹配的
- semi_join 只匹配左边的列
- anti_join 与上面函数相反
4.8 分组汇总 group_by
tbl <- select(order, date = orderdate, price = totalprice) %>%
mutate(year = substr(date,1,4), month = substr(date,6,7),
day = substr(date,9,10))
by_year <- group_by(tbl, year) %>%
summarise(mean(price), sum(price), max(month))
by_year
#> # A tibble: 6 x 4
#> year mean(price) sum(price) max(month)
#> <chr> <dbl> <dbl> <chr>
#> 1 2009 34.1 262628 12
#> 2 2010 52.2 967429 12
#> 3 2011 51.4 1380637 12
#> 4 2012 68.4 1404113 12
#> 5 2013 76.7 1633005 12
#> 6 2014 84.5 429962 05
order_year <- group_by(tbl, year)