Part 5 Week 2
Getting Started with Data Visualization in R > Week 2
5.5 Grouping and summarizing data
#####Data Wrangling with the tidyverse
### install the tidyverse if you don't have it installed. You only have to do this once.
#install.packages("tidyverse")
###load the tidyverse functions #### Do this every time you want to use tidyverse commands
library(tidyverse)
####Use read_csv instead of read.csv
#### make sure you have the file in your working directory, or use the complete file path. Use setwd() if you need to.
# setwd("")
# Read the survey sample into a tibble named cces.
cces <- read_csv("week2/cces_sample_coursera.csv")
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> .default = col_double()
#> )
#> ℹ Use `spec()` for the full column specifications.
#### read_csv produces a tibble rather than a plain data frame; class() lists the tibble classes.
class(cces)
#> [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
# vignette("tibble")
##### if you need to switch back and forth between tibble and dataframe for some reason
# Tibble -> base data frame, then back to a tibble.
cces_dataframe <- as.data.frame(cces)
cces_tibble <- as_tibble(cces_dataframe)
####drop rows with missing data
# Overwrite cces with only the rows that have no missing values.
cces <- drop_na(cces)
##### Use the filter function
####selects only women respondents
# Keep only the rows where gender equals 2.
women <- filter(cces, gender == 2)
####remember the other logical operators
# >
# <
# <=
# >=
# &
# |
# %in%
# Compare the dimensions of the full data and the women-only subset.
dim(cces)
#> [1] 869 25
dim(women)
#> [1] 478 25
# Tabulate gender in the full data; the count for 2 matches the rows in women.
table(cces$gender)
#>
#> 1 2
#> 391 478
# Every value of gender left in the subset is 2.
women$gender
#> [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [149] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [223] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [260] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [297] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [334] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [371] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [408] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [445] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
# Combine two conditions with &: gender equal to 2 and pid7 above 4.
republican_women <- filter(cces, gender == 2 & pid7 > 4)
dim(republican_women)
#> [1] 154 25
table(republican_women$pid7)
#>
#> 5 6 7
#> 28 36 90
republican_women$gender
#> [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [149] 2 2 2 2 2 2
# Show the pid7 values kept by the filter.
republican_women$pid7
#> [1] 7 5 7 5 7 7 7 7 6 7 6 6 7 7 7 7 5 7 6 5 7 6 7 5 7 5 7 6 7 7 7 7 6 7 7 6 5
#> [38] 6 7 5 7 5 7 6 5 7 7 7 6 7 7 5 5 7 7 5 6 7 7 7 7 7 7 7 5 7 6 7 5 7 7 7 7 7
#> [75] 7 7 5 7 7 7 7 7 7 7 6 7 7 7 7 6 6 7 6 7 6 5 6 7 6 7 7 7 6 7 5 7 7 7 7 7 5
#> [112] 6 7 7 6 6 6 7 7 7 7 7 6 5 6 6 5 6 6 5 6 6 6 6 7 5 7 7 5 6 7 7 7 7 5 7 7 6
#> [149] 7 7 7 5 5 5
# Preview the first rows of the filtered tibble.
head(republican_women)
#> # A tibble: 6 x 25
#> caseid region gender educ edloan race hispanic employ marstat pid7 ideo5
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 412873566 2 2 3 2 1 2 7 1 7 5
#> 2 416479672 3 2 2 2 1 2 5 4 5 3
#> 3 412985437 3 2 4 2 1 2 6 3 7 3
#> 4 413425778 4 2 3 2 1 2 1 5 5 4
#> 5 413859987 3 2 6 1 1 2 1 1 7 5
#> 6 412380857 4 2 6 1 1 2 1 1 7 3
#> # … with 14 more variables: pew_religimp <dbl>, newsint <dbl>,
#> # faminc_new <dbl>, union <dbl>, investor <dbl>, CC18_308a <dbl>,
#> # CC18_310a <dbl>, CC18_310b <dbl>, CC18_310c <dbl>, CC18_310d <dbl>,
#> # CC18_325a <dbl>, CC18_325b <dbl>, CC18_325c <dbl>, CC18_325d <dbl>
#### Select certain columns from the data: keep only educ and employ.
select(republican_women, "educ", "employ")
#> # A tibble: 154 x 2
#> educ employ
#> <dbl> <dbl>
#> 1 3 7
#> 2 2 5
#> 3 4 6
#> 4 3 1
#> 5 6 1
#> 6 6 1
#> # … with 148 more rows
####combine multiple commands using piping
#x %>% f(y) is the same as f(x, y)
#y %>% f(x, ., z) is the same as f(x, y, z)
# Start from cces, keep rows with gender 2 and pid7 above 4, then keep two columns.
women_republicans_educ_employ <- cces %>%
  filter(gender == 2 & pid7 > 4) %>%
  select("educ", "employ")
####recode variables
# Collapse the pid7 values 1-7 into three party labels.
party <- recode(
  cces$pid7,
  `1` = "Democrat",
  `2` = "Democrat",
  `3` = "Democrat",
  `4` = "Independent",
  `5` = "Republican",
  `6` = "Republican",
  `7` = "Republican"
)
# Store the recoded labels as a new column of cces.
cces$party <- party
####rename variables
# rename() takes new_name = old_name; the result is stored in test.
test <- rename(cces, trump_approval = CC18_308a)
test$trump_approval
#> [1] 2 4 4 4 4 1 1 4 1 4 4 1 4 4 4 4 4 1 4 4 2 2 1 4 1 4 1 4 4 2 4 4 4 4 4 4 2
#> [38] 4 1 4 2 1 1 3 3 1 1 4 4 4 1 4 4 3 2 3 4 4 4 4 4 4 4 4 4 4 4 4 4 2 1 1 1 1
#> [75] 4 1 2 4 4 3 4 4 4 2 4 4 4 4 4 4 4 3 2 4 2 4 1 1 4 1 4 4 1 1 1 4 1 1 1 4 4
#> [112] 4 1 4 4 1 4 4 4 4 1 2 4 4 4 4 4 4 1 4 3 4 1 1 4 4 3 1 1 4 4 1 2 4 4 4 4 3
#> [149] 4 3 4 4 4 1 4 4 3 1 2 2 4 4 4 1 4 4 3 4 4 4 1 1 4 3 1 1 4 4 2 4 3 4 4 4 4
#> [186] 4 4 3 3 1 4 4 2 4 4 4 1 2 3 1 4 4 4 4 4 1 1 3 4 2 4 4 4 4 1 1 2 1 3 1 4 2
#> [223] 1 4 4 1 4 2 3 4 1 4 1 4 1 4 3 1 4 4 4 2 1 4 2 1 1 4 2 4 2 4 1 4 1 4 1 4 2
#> [260] 1 4 4 3 2 1 4 3 4 3 1 2 4 4 1 3 2 2 1 1 4 2 2 1 4 4 4 1 3 4 1 4 1 1 3 4 1
#> [297] 4 2 4 4 1 1 1 1 4 1 2 1 3 1 1 3 4 1 2 4 4 3 4 4 4 3 1 2 4 4 1 4 4 4 1 4 2
#> [334] 4 4 4 4 4 4 4 2 2 1 4 2 2 1 4 1 2 3 4 2 1 3 2 2 1 2 4 1 1 3 1 4 1 1 4 4 1
#> [371] 4 3 1 1 1 4 1 4 2 3 2 4 4 1 1 1 1 4 1 4 1 2 1 1 4 1 4 1 4 1 2 4 2 2 4 4 4
#> [408] 1 2 4 4 1 1 4 1 1 1 4 2 4 2 4 1 4 4 4 1 4 4 3 4 4 4 2 4 1 3 3 4 2 4 1 4 1
#> [445] 4 1 4 4 4 1 3 1 2 3 1 3 4 1 2 1 2 3 1 4 3 4 1 2 4 1 1 2 3 2 1 4 4 4 3 4 1
#> [482] 4 4 4 1 4 1 4 4 4 2 4 4 4 2 1 1 4 3 1 2 2 1 4 2 4 4 1 1 2 4 3 1 4 4 1 3 4
#> [519] 4 1 2 4 4 4 2 4 4 2 1 2 4 2 2 2 4 4 1 3 4 4 4 2 4 1 2 1 2 1 3 4 2 4 4 2 2
#> [556] 4 2 1 1 1 4 4 4 1 4 4 1 4 2 1 2 2 2 4 1 4 1 4 2 4 4 4 1 3 2 4 1 4 4 4 1 4
#> [593] 4 4 1 4 4 4 3 4 4 1 4 4 4 4 4 1 1 4 4 4 3 2 4 4 4 4 1 4 4 3 1 4 1 4 4 4 4
#> [630] 1 4 4 2 3 1 4 4 4 4 4 4 1 4 2 4 1 1 4 1 4 1 2 4 4 4 1 1 4 4 2 4 3 4 4 1 4
#> [667] 1 4 1 2 1 4 2 2 4 4 4 4 4 4 2 4 2 3 4 2 4 1 4 4 4 2 4 2 4 1 2 4 1 4 4 4 1
#> [704] 4 4 1 4 1 1 4 2 4 1 4 4 2 2 2 1 4 4 4 4 2 4 1 2 4 4 2 4 1 4 4 1 4 2 1 2 4
#> [741] 4 1 4 3 4 1 4 1 2 4 4 1 1 1 4 2 4 4 1 2 2 4 4 1 2 3 3 3 4 4 3 1 4 1 4 1 1
#> [778] 3 4 4 4 3 1 4 2 4 1 2 4 2 4 4 4 3 3 1 4 4 2 2 4 2 1 1 4 4 1 1 3 1 3 4 1 4
#> [815] 4 4 3 4 4 3 1 4 1 1 4 2 1 4 4 2 4 4 4 4 4 4 1 4 2 4 1 4 2 1 1 1 1 4 2 1 2
#> [852] 1 3 2 4 1 4 4 4 1 1 2 2 2 2 4 4 4 4
# Keep the renamed version as the working data.
cces <- test
cces$trump_approval
#> [1] 2 4 4 4 4 1 1 4 1 4 4 1 4 4 4 4 4 1 4 4 2 2 1 4 1 4 1 4 4 2 4 4 4 4 4 4 2
#> [38] 4 1 4 2 1 1 3 3 1 1 4 4 4 1 4 4 3 2 3 4 4 4 4 4 4 4 4 4 4 4 4 4 2 1 1 1 1
#> [75] 4 1 2 4 4 3 4 4 4 2 4 4 4 4 4 4 4 3 2 4 2 4 1 1 4 1 4 4 1 1 1 4 1 1 1 4 4
#> [112] 4 1 4 4 1 4 4 4 4 1 2 4 4 4 4 4 4 1 4 3 4 1 1 4 4 3 1 1 4 4 1 2 4 4 4 4 3
#> [149] 4 3 4 4 4 1 4 4 3 1 2 2 4 4 4 1 4 4 3 4 4 4 1 1 4 3 1 1 4 4 2 4 3 4 4 4 4
#> [186] 4 4 3 3 1 4 4 2 4 4 4 1 2 3 1 4 4 4 4 4 1 1 3 4 2 4 4 4 4 1 1 2 1 3 1 4 2
#> [223] 1 4 4 1 4 2 3 4 1 4 1 4 1 4 3 1 4 4 4 2 1 4 2 1 1 4 2 4 2 4 1 4 1 4 1 4 2
#> [260] 1 4 4 3 2 1 4 3 4 3 1 2 4 4 1 3 2 2 1 1 4 2 2 1 4 4 4 1 3 4 1 4 1 1 3 4 1
#> [297] 4 2 4 4 1 1 1 1 4 1 2 1 3 1 1 3 4 1 2 4 4 3 4 4 4 3 1 2 4 4 1 4 4 4 1 4 2
#> [334] 4 4 4 4 4 4 4 2 2 1 4 2 2 1 4 1 2 3 4 2 1 3 2 2 1 2 4 1 1 3 1 4 1 1 4 4 1
#> [371] 4 3 1 1 1 4 1 4 2 3 2 4 4 1 1 1 1 4 1 4 1 2 1 1 4 1 4 1 4 1 2 4 2 2 4 4 4
#> [408] 1 2 4 4 1 1 4 1 1 1 4 2 4 2 4 1 4 4 4 1 4 4 3 4 4 4 2 4 1 3 3 4 2 4 1 4 1
#> [445] 4 1 4 4 4 1 3 1 2 3 1 3 4 1 2 1 2 3 1 4 3 4 1 2 4 1 1 2 3 2 1 4 4 4 3 4 1
#> [482] 4 4 4 1 4 1 4 4 4 2 4 4 4 2 1 1 4 3 1 2 2 1 4 2 4 4 1 1 2 4 3 1 4 4 1 3 4
#> [519] 4 1 2 4 4 4 2 4 4 2 1 2 4 2 2 2 4 4 1 3 4 4 4 2 4 1 2 1 2 1 3 4 2 4 4 2 2
#> [556] 4 2 1 1 1 4 4 4 1 4 4 1 4 2 1 2 2 2 4 1 4 1 4 2 4 4 4 1 3 2 4 1 4 4 4 1 4
#> [593] 4 4 1 4 4 4 3 4 4 1 4 4 4 4 4 1 1 4 4 4 3 2 4 4 4 4 1 4 4 3 1 4 1 4 4 4 4
#> [630] 1 4 4 2 3 1 4 4 4 4 4 4 1 4 2 4 1 1 4 1 4 1 2 4 4 4 1 1 4 4 2 4 3 4 4 1 4
#> [667] 1 4 1 2 1 4 2 2 4 4 4 4 4 4 2 4 2 3 4 2 4 1 4 4 4 2 4 2 4 1 2 4 1 4 4 4 1
#> [704] 4 4 1 4 1 1 4 2 4 1 4 4 2 2 2 1 4 4 4 4 2 4 1 2 4 4 2 4 1 4 4 1 4 2 1 2 4
#> [741] 4 1 4 3 4 1 4 1 2 4 4 1 1 1 4 2 4 4 1 2 2 4 4 1 2 3 3 3 4 4 3 1 4 1 4 1 1
#> [778] 3 4 4 4 3 1 4 2 4 1 2 4 2 4 4 4 3 3 1 4 4 2 2 4 2 1 1 4 4 1 1 3 1 3 4 1 4
#> [815] 4 4 3 4 4 3 1 4 1 1 4 2 1 4 4 2 4 4 4 4 4 4 1 4 2 4 1 4 2 1 1 1 1 4 2 1 2
#> [852] 1 3 2 4 1 4 4 4 1 1 2 2 2 2 4 4 4 4
####calculate new numeric variables
# Recode CC18_310b and CC18_310c into 0/1 indicators
# (values 1 and 5 become 0; values 2, 3 and 4 become 1).
rec_sen1_01 <- recode(
  cces$CC18_310b,
  `1` = 0,
  `5` = 0,
  `2` = 1,
  `3` = 1,
  `4` = 1
)
rec_sen2_01 <- recode(
  cces$CC18_310c,
  `1` = 0,
  `5` = 0,
  `2` = 1,
  `3` = 1,
  `4` = 1
)
# Attach both indicators to the data.
cces$rec_sen1_01 <- rec_sen1_01
cces$rec_sen2_01 <- rec_sen2_01
# Add the two indicators to get a score from 0 to 2.
cces <- mutate(cces, know_sens = rec_sen1_01 + rec_sen2_01)
cces$know_sens
#> [1] 2 1 2 2 2 2 2 2 2 2 2 2 0 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [38] 2 2 0 2 2 2 0 2 2 2 0 2 1 1 0 2 2 2 2 1 2 1 2 2 2 2 1 0 0 2 2 2 2 2 2 2 2
#> [75] 0 2 2 2 1 0 0 2 1 1 2 2 2 2 2 0 2 1 2 0 2 2 1 2 1 2 0 2 2 1 2 2 2 2 2 1 2
#> [112] 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 0 2 2 2 2 2 2 0 2 2 2 2 2 1 2 2 2 2 2
#> [149] 2 2 2 2 2 1 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 1 2 2 2 2 1
#> [186] 2 0 0 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 1 2
#> [223] 2 2 2 2 2 2 2 1 2 2 0 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 1 2 2 2
#> [260] 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 0
#> [297] 1 1 2 0 2 2 2 2 0 0 0 2 2 1 2 2 1 2 2 2 2 2 2 2 1 0 0 1 2 2 2 2 2 2 2 2 2
#> [334] 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2
#> [371] 2 1 2 2 2 2 0 2 2 0 2 1 2 2 1 2 2 1 2 1 2 2 2 2 2 0 2 2 2 2 1 2 2 2 2 2 2
#> [408] 2 1 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 2 2 2 1 0 2 2
#> [445] 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 0 1 2 2 2 1 2 2 2 2 1 2 2 1 2 2 2 2 2 2 2
#> [482] 2 2 2 1 1 2 2 0 2 0 2 2 2 1 2 0 2 0 1 0 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2
#> [519] 2 2 2 2 2 2 2 0 2 2 2 2 2 1 2 2 2 2 2 0 2 2 2 1 1 2 2 2 2 2 2 2 2 2 2 2 0
#> [556] 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2 2 2 2 1 2 1 2 2 1 2 1 2 2 2 2 2
#> [593] 2 0 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 2 0 2 2
#> [630] 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 1 2 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2
#> [667] 1 2 2 1 2 2 2 2 2 0 2 2 1 2 2 2 1 2 2 0 0 2 2 2 2 1 2 0 2 2 2 2 2 0 2 2 1
#> [704] 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 1 2 0 2 2 2 1 2 1 2 0 2 2 2 2 2 2 0 2
#> [741] 0 0 2 2 0 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 1 2 2 2 2 0 1 0 2 0 2 2 2 2 1
#> [778] 1 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2
#> [815] 2 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 0 2 2 2 2 0 2 2 2 1 2 2 2 0
#> [852] 2 2 1 1 2 2 2 2 2 2 2 2 2 0 2 2 2 2
##### reorder rows by column values
# Sort by gender, then by pid7 within gender (both ascending).
sorted_by_gender_and_party <- cces %>% arrange(gender, pid7)
# View(sorted_by_gender_and_party)
# desc() reverses the pid7 ordering within each gender.
sorted_by_gender_and_party <- cces %>% arrange(gender, desc(pid7))
# View(sorted_by_gender_and_party)
##### add grouping to data
# group_by() leaves the rows unchanged and only records the grouping.
grouped_gender_pid7 <- cces %>% group_by(gender, pid7)
grouped_gender_pid7
#> # A tibble: 869 x 29
#> # Groups: gender, pid7 [14]
#> caseid region gender educ edloan race hispanic employ marstat pid7 ideo5
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 417614315 3 1 2 2 1 2 5 3 6 3
#> 2 415490556 1 2 6 2 1 1 1 1 2 2
#> 3 414351505 3 2 3 2 2 2 1 4 2 3
#> 4 411855339 1 2 5 2 6 2 5 3 3 1
#> 5 420208067 2 1 3 2 1 2 1 1 4 5
#> 6 412517331 1 1 2 2 1 2 5 5 5 4
#> # … with 863 more rows, and 18 more variables: pew_religimp <dbl>,
#> # newsint <dbl>, faminc_new <dbl>, union <dbl>, investor <dbl>,
#> # trump_approval <dbl>, CC18_310a <dbl>, CC18_310b <dbl>, CC18_310c <dbl>,
#> # CC18_310d <dbl>, CC18_325a <dbl>, CC18_325b <dbl>, CC18_325c <dbl>,
#> # CC18_325d <dbl>, party <chr>, rec_sen1_01 <dbl>, rec_sen2_01 <dbl>,
#> # know_sens <dbl>
###remove grouping with ungroup
#ungroup(grouped_gender_pid7)
##### summarize the data
# ?summarise
# With no grouping, summarise() returns a single row of summaries.
summarise(
  cces,
  mean_pid7 = mean(pid7),
  mean_faminc = mean(faminc_new)
)
#> # A tibble: 1 x 2
#> mean_pid7 mean_faminc
#> <dbl> <dbl>
#> 1 3.62 6.58
#####you could do the same thing with piping
#cces %>% summarise(mean_pid7=mean(pid7),mean_faminc=mean(faminc_new))
###when you summarise grouped data, you get summaries by group
# Group by gender first, so summarise() returns one row per gender value.
grouped_gender <- cces %>% group_by(gender)
summarise(
  grouped_gender,
  mean_pid7 = mean(pid7),
  mean_faminc = mean(faminc_new)
)
#> # A tibble: 2 x 3
#> gender mean_pid7 mean_faminc
#> <dbl> <dbl> <dbl>
#> 1 1 3.93 7.18
#> 2 2 3.37 6.09
####combine all of this with piping if you want to look like a pro and have fewer lines of code
#cces %>% group_by(gender) %>% summarise(mean=mean(pid7))
5.6 Practices
library(tidyverse)
# Download the survey sample, read it as a tibble, and drop rows with missing values.
dat <- drop_na(
  read_csv(url("https://www.dropbox.com/s/uhfstf6g36ghxwp/cces_sample_coursera.csv?raw=1"))
)
#>
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#> .default = col_double()
#> )
#> ℹ Use `spec()` for the full column specifications.
# Problem 1
# The object "dat" created in the assignment code will import the survey data for the assignment using read_csv, thereby creating a tibble. Using that object as your data, use select() to create a new tibble that includes only the columns for educational level, whether the respondent has an educational loan, employment status, and Trump approval. Display that object. Hint: consult the codebook to identify the correct column names.
#### Write your code below:
# Preview the first rows of the imported data.
dat %>% head()
#> # A tibble: 6 x 25
#> caseid region gender educ edloan race hispanic employ marstat pid7 ideo5
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 417614315 3 1 2 2 1 2 5 3 6 3
#> 2 415490556 1 2 6 2 1 1 1 1 2 2
#> 3 414351505 3 2 3 2 2 2 1 4 2 3
#> 4 411855339 1 2 5 2 6 2 5 3 3 1
#> 5 420208067 2 1 3 2 1 2 1 1 4 5
#> 6 412517331 1 1 2 2 1 2 5 5 5 4
#> # … with 14 more variables: pew_religimp <dbl>, newsint <dbl>,
#> # faminc_new <dbl>, union <dbl>, investor <dbl>, CC18_308a <dbl>,
#> # CC18_310a <dbl>, CC18_310b <dbl>, CC18_310c <dbl>, CC18_310d <dbl>,
#> # CC18_325a <dbl>, CC18_325b <dbl>, CC18_325c <dbl>, CC18_325d <dbl>
# Keep the four columns asked for in Problem 1, then preview the result.
df <- dat %>% select(educ, edloan, employ, CC18_308a)
df %>% head()
#> # A tibble: 6 x 4
#> educ edloan employ CC18_308a
#> <dbl> <dbl> <dbl> <dbl>
#> 1 2 2 5 2
#> 2 6 2 1 4
#> 3 3 2 1 4
#> 4 5 2 5 4
#> 5 3 2 1 4
#> 6 2 2 5 1
# Problem 2
# Continuing to use the new data table you created in Problem 1, use recode() to create a new column named "trump_approve_disapprove" that recodes the column for President Trump's job approval. A value of "1" should mean that the respondent either "strongly" or "somewhat" approves of the President, and a value of 0 should mean that the respondent either "strongly" or "somewhat" DISapproves of the president. Display the resulting object.
#### Write your code below:
CC18_308a Job approval by President Trump
Do you approve or disapprove of the way each is doing their job…
1 Strongly approve
2 Somewhat approve
3 Somewhat disapprove
4 Strongly disapprove
# Approve (1, 2) becomes 1; disapprove (3, 4) becomes 0.
df$trump_approve_disapprove <- df$CC18_308a %>%
  recode(
    `1` = 1,
    `2` = 1,
    `3` = 0,
    `4` = 0
  )
df %>% head()
#> # A tibble: 6 x 5
#> educ edloan employ CC18_308a trump_approve_disapprove
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2 2 5 2 1
#> 2 6 2 1 4 0
#> 3 3 2 1 4 0
#> 4 5 2 5 4 0
#> 5 3 2 1 4 0
#> 6 2 2 5 1 1
# Problem 3
# Use summarise() to create a summary table for survey respondents who are employed full time and are married. The table should have the mean and median for the importance of religion column.
#### Write your code below:
employ: Which of the following best describes your current employment status?
1 Full-time
2 Part-time
3 Temporarily laid off
4 Unemployed
5 Retired
6 Permanently disabled
7 Homemaker
8 Student
9 Other
marstat: What is your marital status?
1 Married
2 Separated
3 Divorced
4 Widowed
5 Never married
6 Domestic/civil partnership
pew_religimp: How important is religion in your life?
1 Very important
2 Somewhat important
3 Not too important
4 Not at all important
# Keep rows with employ == 1 and marstat == 1, then summarise pew_religimp.
dat %>%
  filter(employ == 1 & marstat == 1) %>%
  summarise(
    `Mean Importance of Religion` = mean(pew_religimp),
    `Median Importance of Religion` = median(pew_religimp)
  )
#> # A tibble: 1 x 2
#> `Mean Importance of Religion` `Median Importance of Religion`
#> <dbl> <dbl>
#> 1 2.19 2