Part 5 Week 2 Asynchronous

Getting Started with Data Visualization in R > Week 2

5.1 Introduction to the tidyverse

# Install the tidyverse only when it is not already available, so that
# re-running these notes does not download and reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}

5.2 Data import and structure in the tidyverse

5.3 Filtering, selecting, recoding, renaming, and piping

5.4 Recoding, Renaming, and Calculating Columns

5.5 Grouping and summarizing data

#####Data Wrangling with the tidyverse

### install the tidyverse if you don't have it installed. You only have to do this once.
#install.packages("tidyverse")

###load the tidyverse functions #### Do this every time you want to use tidyverse commands
library(tidyverse)

####Use read_csv instead of read.csv

#### make sure you have the file in your working directory, or use the complete file path. Use setwd() if you need to.

# setwd("path/to/your/working/directory")

# Import the survey sample with readr; the path is relative to the working directory.
cces <- read_csv(file = "week2/cces_sample_coursera.csv")

#> 
#> ── Column specification ─────────────────────────────────────────────
#> cols(
#>   .default = col_double()
#> )
#> ℹ Use `spec()` for the full column specifications.

#### read_csv produces a tibble rather than a dataframe.

class(cces)

#> [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

# vignette("tibble")

##### if you need to switch back and forth between tibble and dataframe for some reason

# Round trip between the two table types: tibble -> data.frame -> tibble.
cces_dataframe <- as.data.frame(x = cces)
cces_tibble <- tibble::as_tibble(x = cces_dataframe)

#### keep only the rows that have no missing value in any column
cces <- tidyr::drop_na(data = cces)

##### Use the filter function
####selects only women respondents
# Keep the rows where gender is coded 2; the dplyr:: prefix makes explicit
# that this is dplyr's filter() and not stats::filter().
women <- dplyr::filter(cces, gender == 2)

####remember the other logical operators

# >
# <
# <=
# >=
# &
# |
# %in%
# Rows and columns of the full sample (after dropping incomplete rows).
dim(cces)

#> [1] 869  25

# Rows and columns of the filtered subset; the column count is unchanged.
dim(women)

#> [1] 478  25

# Frequency of each gender code in the full sample; the count for code 2
# matches the number of rows in `women`.
table(cces$gender)

#> 
#>   1   2 
#> 391 478

# Print the gender column of the subset to confirm every value is 2.
women$gender

#>   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#>  [33] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#>  [65] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#>  [97] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [129] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [161] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [193] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [225] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [257] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [289] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [321] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [353] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [385] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [417] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [449] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

# Combine two conditions with `&`: gender coded 2 and pid7 above 4.
republican_women <- dplyr::filter(
  cces,
  (gender == 2) & (pid7 > 4)
)

# Rows and columns of the subset.
dim(x = republican_women)

#> [1] 154  25

table(republican_women$pid7)

#> 
#>  5  6  7 
#> 28 36 90

republican_women$gender

#>   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#>  [33] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#>  [65] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#>  [97] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [129] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2

republican_women$pid7

#>   [1] 7 5 7 5 7 7 7 7 6 7 6 6 7 7 7 7 5 7 6 5 7 6 7 5 7 5 7 6 7 7 7 7
#>  [33] 6 7 7 6 5 6 7 5 7 5 7 6 5 7 7 7 6 7 7 5 5 7 7 5 6 7 7 7 7 7 7 7
#>  [65] 5 7 6 7 5 7 7 7 7 7 7 7 5 7 7 7 7 7 7 7 6 7 7 7 7 6 6 7 6 7 6 5
#>  [97] 6 7 6 7 7 7 6 7 5 7 7 7 7 7 5 6 7 7 6 6 6 7 7 7 7 7 6 5 6 6 5 6
#> [129] 6 5 6 6 6 6 7 5 7 7 5 6 7 7 7 7 5 7 7 6 7 7 7 5 5 5

head(republican_women)

#> # A tibble: 6 x 25
#>      caseid region gender  educ edloan  race hispanic employ marstat
#>       <dbl>  <dbl>  <dbl> <dbl>  <dbl> <dbl>    <dbl>  <dbl>   <dbl>
#> 1 412873566      2      2     3      2     1        2      7       1
#> 2 416479672      3      2     2      2     1        2      5       4
#> 3 412985437      3      2     4      2     1        2      6       3
#> 4 413425778      4      2     3      2     1        2      1       5
#> 5 413859987      3      2     6      1     1        2      1       1
#> 6 412380857      4      2     6      1     1        2      1       1
#> # … with 16 more variables: pid7 <dbl>, ideo5 <dbl>,
#> #   pew_religimp <dbl>, newsint <dbl>, faminc_new <dbl>,
#> #   union <dbl>, investor <dbl>, CC18_308a <dbl>, CC18_310a <dbl>,
#> #   CC18_310b <dbl>, CC18_310c <dbl>, CC18_310d <dbl>,
#> #   CC18_325a <dbl>, CC18_325b <dbl>, CC18_325c <dbl>,
#> #   CC18_325d <dbl>

####select certain columns from the data

# Keep only the educ and employ columns (bare column names work as well as
# quoted ones in select()).
select(republican_women, educ, employ)

#> # A tibble: 154 x 2
#>    educ employ
#>   <dbl>  <dbl>
#> 1     3      7
#> 2     2      5
#> 3     4      6
#> 4     3      1
#> 5     6      1
#> 6     6      1
#> # … with 148 more rows

####combine multiple commands using piping

#x %>% f(y) is the same as f(x, y)
#y %>% f(x, ., z) is the same as f(x, y, z)

# Same filter and column selection as the separate steps, chained with the
# pipe and written one step per line.
women_republicans_educ_employ <- cces %>%
  filter(gender == 2, pid7 > 4) %>%
  select(educ, employ)

####recode variables

# Collapse the seven pid7 codes into three labels:
# 1-3 -> "Democrat", 4 -> "Independent", 5-7 -> "Republican".
party <- cces %>%
  pull(pid7) %>%
  recode(
    `1` = "Democrat", `2` = "Democrat", `3` = "Democrat",
    `4` = "Independent",
    `5` = "Republican", `6` = "Republican", `7` = "Republican"
  )

# Attach the labels to the data as a new column.
cces[["party"]] <- party

####rename variables

# rename() takes new_name = old_name; the result is stored in a copy so the
# original cces is untouched for now.
test <- cces %>%
  rename(trump_approval = CC18_308a)

# Print the column under its new name.
test %>% pull(trump_approval)

#>   [1] 2 4 4 4 4 1 1 4 1 4 4 1 4 4 4 4 4 1 4 4 2 2 1 4 1 4 1 4 4 2 4 4
#>  [33] 4 4 4 4 2 4 1 4 2 1 1 3 3 1 1 4 4 4 1 4 4 3 2 3 4 4 4 4 4 4 4 4
#>  [65] 4 4 4 4 4 2 1 1 1 1 4 1 2 4 4 3 4 4 4 2 4 4 4 4 4 4 4 3 2 4 2 4
#>  [97] 1 1 4 1 4 4 1 1 1 4 1 1 1 4 4 4 1 4 4 1 4 4 4 4 1 2 4 4 4 4 4 4
#> [129] 1 4 3 4 1 1 4 4 3 1 1 4 4 1 2 4 4 4 4 3 4 3 4 4 4 1 4 4 3 1 2 2
#> [161] 4 4 4 1 4 4 3 4 4 4 1 1 4 3 1 1 4 4 2 4 3 4 4 4 4 4 4 3 3 1 4 4
#> [193] 2 4 4 4 1 2 3 1 4 4 4 4 4 1 1 3 4 2 4 4 4 4 1 1 2 1 3 1 4 2 1 4
#> [225] 4 1 4 2 3 4 1 4 1 4 1 4 3 1 4 4 4 2 1 4 2 1 1 4 2 4 2 4 1 4 1 4
#> [257] 1 4 2 1 4 4 3 2 1 4 3 4 3 1 2 4 4 1 3 2 2 1 1 4 2 2 1 4 4 4 1 3
#> [289] 4 1 4 1 1 3 4 1 4 2 4 4 1 1 1 1 4 1 2 1 3 1 1 3 4 1 2 4 4 3 4 4
#> [321] 4 3 1 2 4 4 1 4 4 4 1 4 2 4 4 4 4 4 4 4 2 2 1 4 2 2 1 4 1 2 3 4
#> [353] 2 1 3 2 2 1 2 4 1 1 3 1 4 1 1 4 4 1 4 3 1 1 1 4 1 4 2 3 2 4 4 1
#> [385] 1 1 1 4 1 4 1 2 1 1 4 1 4 1 4 1 2 4 2 2 4 4 4 1 2 4 4 1 1 4 1 1
#> [417] 1 4 2 4 2 4 1 4 4 4 1 4 4 3 4 4 4 2 4 1 3 3 4 2 4 1 4 1 4 1 4 4
#> [449] 4 1 3 1 2 3 1 3 4 1 2 1 2 3 1 4 3 4 1 2 4 1 1 2 3 2 1 4 4 4 3 4
#> [481] 1 4 4 4 1 4 1 4 4 4 2 4 4 4 2 1 1 4 3 1 2 2 1 4 2 4 4 1 1 2 4 3
#> [513] 1 4 4 1 3 4 4 1 2 4 4 4 2 4 4 2 1 2 4 2 2 2 4 4 1 3 4 4 4 2 4 1
#> [545] 2 1 2 1 3 4 2 4 4 2 2 4 2 1 1 1 4 4 4 1 4 4 1 4 2 1 2 2 2 4 1 4
#> [577] 1 4 2 4 4 4 1 3 2 4 1 4 4 4 1 4 4 4 1 4 4 4 3 4 4 1 4 4 4 4 4 1
#> [609] 1 4 4 4 3 2 4 4 4 4 1 4 4 3 1 4 1 4 4 4 4 1 4 4 2 3 1 4 4 4 4 4
#> [641] 4 1 4 2 4 1 1 4 1 4 1 2 4 4 4 1 1 4 4 2 4 3 4 4 1 4 1 4 1 2 1 4
#> [673] 2 2 4 4 4 4 4 4 2 4 2 3 4 2 4 1 4 4 4 2 4 2 4 1 2 4 1 4 4 4 1 4
#> [705] 4 1 4 1 1 4 2 4 1 4 4 2 2 2 1 4 4 4 4 2 4 1 2 4 4 2 4 1 4 4 1 4
#> [737] 2 1 2 4 4 1 4 3 4 1 4 1 2 4 4 1 1 1 4 2 4 4 1 2 2 4 4 1 2 3 3 3
#> [769] 4 4 3 1 4 1 4 1 1 3 4 4 4 3 1 4 2 4 1 2 4 2 4 4 4 3 3 1 4 4 2 2
#> [801] 4 2 1 1 4 4 1 1 3 1 3 4 1 4 4 4 3 4 4 3 1 4 1 1 4 2 1 4 4 2 4 4
#> [833] 4 4 4 4 1 4 2 4 1 4 2 1 1 1 1 4 2 1 2 1 3 2 4 1 4 4 4 1 1 2 2 2
#> [865] 2 4 4 4 4

# Adopt the renamed copy as the working data set.
cces <- test
# The column is now reachable from cces under the new name.
cces %>% pull(trump_approval)

#>   [1] 2 4 4 4 4 1 1 4 1 4 4 1 4 4 4 4 4 1 4 4 2 2 1 4 1 4 1 4 4 2 4 4
#>  [33] 4 4 4 4 2 4 1 4 2 1 1 3 3 1 1 4 4 4 1 4 4 3 2 3 4 4 4 4 4 4 4 4
#>  [65] 4 4 4 4 4 2 1 1 1 1 4 1 2 4 4 3 4 4 4 2 4 4 4 4 4 4 4 3 2 4 2 4
#>  [97] 1 1 4 1 4 4 1 1 1 4 1 1 1 4 4 4 1 4 4 1 4 4 4 4 1 2 4 4 4 4 4 4
#> [129] 1 4 3 4 1 1 4 4 3 1 1 4 4 1 2 4 4 4 4 3 4 3 4 4 4 1 4 4 3 1 2 2
#> [161] 4 4 4 1 4 4 3 4 4 4 1 1 4 3 1 1 4 4 2 4 3 4 4 4 4 4 4 3 3 1 4 4
#> [193] 2 4 4 4 1 2 3 1 4 4 4 4 4 1 1 3 4 2 4 4 4 4 1 1 2 1 3 1 4 2 1 4
#> [225] 4 1 4 2 3 4 1 4 1 4 1 4 3 1 4 4 4 2 1 4 2 1 1 4 2 4 2 4 1 4 1 4
#> [257] 1 4 2 1 4 4 3 2 1 4 3 4 3 1 2 4 4 1 3 2 2 1 1 4 2 2 1 4 4 4 1 3
#> [289] 4 1 4 1 1 3 4 1 4 2 4 4 1 1 1 1 4 1 2 1 3 1 1 3 4 1 2 4 4 3 4 4
#> [321] 4 3 1 2 4 4 1 4 4 4 1 4 2 4 4 4 4 4 4 4 2 2 1 4 2 2 1 4 1 2 3 4
#> [353] 2 1 3 2 2 1 2 4 1 1 3 1 4 1 1 4 4 1 4 3 1 1 1 4 1 4 2 3 2 4 4 1
#> [385] 1 1 1 4 1 4 1 2 1 1 4 1 4 1 4 1 2 4 2 2 4 4 4 1 2 4 4 1 1 4 1 1
#> [417] 1 4 2 4 2 4 1 4 4 4 1 4 4 3 4 4 4 2 4 1 3 3 4 2 4 1 4 1 4 1 4 4
#> [449] 4 1 3 1 2 3 1 3 4 1 2 1 2 3 1 4 3 4 1 2 4 1 1 2 3 2 1 4 4 4 3 4
#> [481] 1 4 4 4 1 4 1 4 4 4 2 4 4 4 2 1 1 4 3 1 2 2 1 4 2 4 4 1 1 2 4 3
#> [513] 1 4 4 1 3 4 4 1 2 4 4 4 2 4 4 2 1 2 4 2 2 2 4 4 1 3 4 4 4 2 4 1
#> [545] 2 1 2 1 3 4 2 4 4 2 2 4 2 1 1 1 4 4 4 1 4 4 1 4 2 1 2 2 2 4 1 4
#> [577] 1 4 2 4 4 4 1 3 2 4 1 4 4 4 1 4 4 4 1 4 4 4 3 4 4 1 4 4 4 4 4 1
#> [609] 1 4 4 4 3 2 4 4 4 4 1 4 4 3 1 4 1 4 4 4 4 1 4 4 2 3 1 4 4 4 4 4
#> [641] 4 1 4 2 4 1 1 4 1 4 1 2 4 4 4 1 1 4 4 2 4 3 4 4 1 4 1 4 1 2 1 4
#> [673] 2 2 4 4 4 4 4 4 2 4 2 3 4 2 4 1 4 4 4 2 4 2 4 1 2 4 1 4 4 4 1 4
#> [705] 4 1 4 1 1 4 2 4 1 4 4 2 2 2 1 4 4 4 4 2 4 1 2 4 4 2 4 1 4 4 1 4
#> [737] 2 1 2 4 4 1 4 3 4 1 4 1 2 4 4 1 1 1 4 2 4 4 1 2 2 4 4 1 2 3 3 3
#> [769] 4 4 3 1 4 1 4 1 1 3 4 4 4 3 1 4 2 4 1 2 4 2 4 4 4 3 3 1 4 4 2 2
#> [801] 4 2 1 1 4 4 1 1 3 1 3 4 1 4 4 4 3 4 4 3 1 4 1 1 4 2 1 4 4 2 4 4
#> [833] 4 4 4 4 1 4 2 4 1 4 2 1 1 1 1 4 2 1 2 1 3 2 4 1 4 4 4 1 1 2 2 2
#> [865] 2 4 4 4 4

####calculate new numeric variables

# Turn each of the two items into a 0/1 indicator:
# codes 2, 3 and 4 become 1; codes 1 and 5 become 0.
rec_sen1_01 <- cces %>%
  pull(CC18_310b) %>%
  recode(`2` = 1, `3` = 1, `4` = 1, `1` = 0, `5` = 0)

rec_sen2_01 <- cces %>%
  pull(CC18_310c) %>%
  recode(`2` = 1, `3` = 1, `4` = 1, `1` = 0, `5` = 0)

# Store both indicators as columns of cces.
cces[["rec_sen1_01"]] <- rec_sen1_01
cces[["rec_sen2_01"]] <- rec_sen2_01

# know_sens is the sum of the two indicators, so it ranges from 0 to 2.
cces <- cces %>%
  mutate(know_sens = rec_sen1_01 + rec_sen2_01)

cces %>% pull(know_sens)

#>   [1] 2 1 2 2 2 2 2 2 2 2 2 2 0 1 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#>  [33] 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 0 2 1 1 0 2 2 2 2 1 2 1 2 2 2 2 1
#>  [65] 0 0 2 2 2 2 2 2 2 2 0 2 2 2 1 0 0 2 1 1 2 2 2 2 2 0 2 1 2 0 2 2
#>  [97] 1 2 1 2 0 2 2 1 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2
#> [129] 2 0 2 2 2 2 2 2 0 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 1 2 2 2 2 1
#> [161] 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 1 2 2 1 2 2 2 2 1 2 0 0 2 2 2 2
#> [193] 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 1 2 2 2 1 1 2 2 2 2 2 2 2 1 2 2 2
#> [225] 2 2 2 2 2 1 2 2 0 2 2 2 2 1 1 1 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 1
#> [257] 2 2 2 2 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2
#> [289] 2 2 2 2 2 1 2 0 1 1 2 0 2 2 2 2 0 0 0 2 2 1 2 2 1 2 2 2 2 2 2 2
#> [321] 1 0 0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2
#> [353] 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 1 2 2 2 2 0 2 2 0 2 1 2 2
#> [385] 1 2 2 1 2 1 2 2 2 2 2 0 2 2 2 2 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2
#> [417] 1 2 2 2 2 1 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 2 2 2 1 0 2 2 2 2 2 2
#> [449] 2 2 2 0 2 2 2 2 2 2 2 2 0 1 2 2 2 1 2 2 2 2 1 2 2 1 2 2 2 2 2 2
#> [481] 2 2 2 2 1 1 2 2 0 2 0 2 2 2 1 2 0 2 0 1 0 2 2 2 2 2 2 2 2 2 2 0
#> [513] 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 1 2 2 2 2 2 0 2 2 2 1 1 2
#> [545] 2 2 2 2 2 2 2 2 2 2 0 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 2
#> [577] 2 2 2 1 2 1 2 2 1 2 1 2 2 2 2 2 2 0 2 1 2 1 2 2 2 2 2 2 2 2 2 2
#> [609] 2 2 2 2 1 2 2 2 2 2 2 2 1 1 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 1
#> [641] 2 2 2 2 2 2 1 1 2 2 2 1 1 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 1 2 2
#> [673] 2 2 2 0 2 2 1 2 2 2 1 2 2 0 0 2 2 2 2 1 2 0 2 2 2 2 2 0 2 2 1 2
#> [705] 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 1 2 0 2 2 2 1 2 1 2 0 2 2 2 2
#> [737] 2 2 0 2 0 0 2 2 0 2 0 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 1 2 2 2 2 0
#> [769] 1 0 2 0 2 2 2 2 1 1 2 2 2 1 2 2 2 2 1 2 2 2 2 1 2 2 2 2 1 1 2 2
#> [801] 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 0 0 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2
#> [833] 1 2 2 2 2 0 2 2 2 2 0 2 2 2 1 2 2 2 0 2 2 1 1 2 2 2 2 2 2 2 2 2
#> [865] 0 2 2 2 2

##### reorder rows by column values

# Ascending by gender, then ascending by pid7 within each gender.
sorted_by_gender_and_party <- arrange(cces, gender, pid7)

# View(sorted_by_gender_and_party)

# Same object again, this time with pid7 from high to low within each gender
# (this overwrites the result of the previous sort).
sorted_by_gender_and_party <- arrange(cces, gender, desc(pid7))

# View(sorted_by_gender_and_party)

##### add grouping to data

# Grouping does not change the rows; it only records the grouping columns,
# which appear in the "Groups:" line when the tibble is printed.
grouped_gender_pid7 <- group_by(cces, gender, pid7)
print(grouped_gender_pid7)

#> # A tibble: 869 x 29
#> # Groups:   gender, pid7 [14]
#>      caseid region gender  educ edloan  race hispanic employ marstat
#>       <dbl>  <dbl>  <dbl> <dbl>  <dbl> <dbl>    <dbl>  <dbl>   <dbl>
#> 1 417614315      3      1     2      2     1        2      5       3
#> 2 415490556      1      2     6      2     1        1      1       1
#> 3 414351505      3      2     3      2     2        2      1       4
#> 4 411855339      1      2     5      2     6        2      5       3
#> 5 420208067      2      1     3      2     1        2      1       1
#> 6 412517331      1      1     2      2     1        2      5       5
#> # … with 863 more rows, and 20 more variables: pid7 <dbl>,
#> #   ideo5 <dbl>, pew_religimp <dbl>, newsint <dbl>,
#> #   faminc_new <dbl>, union <dbl>, investor <dbl>,
#> #   trump_approval <dbl>, CC18_310a <dbl>, CC18_310b <dbl>,
#> #   CC18_310c <dbl>, CC18_310d <dbl>, CC18_325a <dbl>,
#> #   CC18_325b <dbl>, CC18_325c <dbl>, CC18_325d <dbl>, party <chr>,
#> #   rec_sen1_01 <dbl>, rec_sen2_01 <dbl>, know_sens <dbl>

###remove grouping with ungroup
#ungroup(grouped_gender_pid7)

##### summarize the data
# ?summarise
# On ungrouped data summarise() collapses everything into a single row.
summarise(
  .data = cces,
  mean_pid7 = mean(pid7),
  mean_faminc = mean(faminc_new)
)

#> # A tibble: 1 x 2
#>   mean_pid7 mean_faminc
#>       <dbl>       <dbl>
#> 1      3.62        6.58

#####you could do the same thing with piping
#cces %>% summarise(mean_pid7=mean(pid7),mean_faminc=mean(faminc_new))

###when you summarise grouped data, you get summaries by group

# Group first, then summarise: the result has one row per gender code.
grouped_gender <- group_by(cces, gender)
summarise(
  .data = grouped_gender,
  mean_pid7 = mean(pid7),
  mean_faminc = mean(faminc_new)
)

#> # A tibble: 2 x 3
#>   gender mean_pid7 mean_faminc
#>    <dbl>     <dbl>       <dbl>
#> 1      1      3.93        7.18
#> 2      2      3.37        6.09

####combine all of this with piping if you want to look like a pro and have fewer lines of code
#cces %>% group_by(gender) %>% summarise(mean=mean(pid7))

5.6 Practices

library(tidyverse)

# Download the practice data, parse it as CSV, and drop rows with any missing value.
dat <- url("https://www.dropbox.com/s/uhfstf6g36ghxwp/cces_sample_coursera.csv?raw=1") %>%
  read_csv() %>%
  drop_na()

#> 
#> ── Column specification ─────────────────────────────────────────────
#> cols(
#>   .default = col_double()
#> )
#> ℹ Use `spec()` for the full column specifications.

# Problem 1

# The object "dat" created in the assignment code will import the survey data for the assignment using read_csv, thereby creating a tibble. Using that object as your data, use select() to create a new tibble that includes only the columns for educational level, whether the respondent has an educational loan, employment status, and Trump approval. Display that object. Hint: consult the codebook to identify the correct column names.

#### Write your code below:
# Peek at the first six rows to see which columns are available.
head(dat)

#> # A tibble: 6 x 25
#>      caseid region gender  educ edloan  race hispanic employ marstat
#>       <dbl>  <dbl>  <dbl> <dbl>  <dbl> <dbl>    <dbl>  <dbl>   <dbl>
#> 1 417614315      3      1     2      2     1        2      5       3
#> 2 415490556      1      2     6      2     1        1      1       1
#> 3 414351505      3      2     3      2     2        2      1       4
#> 4 411855339      1      2     5      2     6        2      5       3
#> 5 420208067      2      1     3      2     1        2      1       1
#> 6 412517331      1      1     2      2     1        2      5       5
#> # … with 16 more variables: pid7 <dbl>, ideo5 <dbl>,
#> #   pew_religimp <dbl>, newsint <dbl>, faminc_new <dbl>,
#> #   union <dbl>, investor <dbl>, CC18_308a <dbl>, CC18_310a <dbl>,
#> #   CC18_310b <dbl>, CC18_310c <dbl>, CC18_310d <dbl>,
#> #   CC18_325a <dbl>, CC18_325b <dbl>, CC18_325c <dbl>,
#> #   CC18_325d <dbl>

# Problem 1: education, education loan, employment status, and Trump approval (CC18_308a).
df <- select(dat, educ, edloan, employ, CC18_308a)
head(df)

#> # A tibble: 6 x 4
#>    educ edloan employ CC18_308a
#>   <dbl>  <dbl>  <dbl>     <dbl>
#> 1     2      2      5         2
#> 2     6      2      1         4
#> 3     3      2      1         4
#> 4     5      2      5         4
#> 5     3      2      1         4
#> 6     2      2      5         1

# Problem 2

# Continuing to use the new data table you created in Problem 1, use recode() to create a new column named "trump_approve_disapprove" that recodes the column for President Trump's job approval. A value of "1" should mean that the respondent either "strongly" or "somewhat" approves of the President, and a value of 0 should mean that the respondent either "strongly" or "somewhat" DISapproves of the president. Display the resulting object. 


#### Write your code below:

CC18_308a Job approval by President Trump
Do you approve or disapprove of the way each is doing their job…
1 Strongly approve
2 Somewhat approve
3 Somewhat disapprove
4 Strongly disapprove

# Problem 2: codes 1 and 2 (approve) become 1; codes 3 and 4 (disapprove) become 0.
df <- df %>%
  mutate(
    trump_approve_disapprove = recode(
      CC18_308a,
      `1` = 1, `2` = 1,
      `3` = 0, `4` = 0
    )
  )

head(df)

#> # A tibble: 6 x 5
#>    educ edloan employ CC18_308a trump_approve_disapprove
#>   <dbl>  <dbl>  <dbl>     <dbl>                    <dbl>
#> 1     2      2      5         2                        1
#> 2     6      2      1         4                        0
#> 3     3      2      1         4                        0
#> 4     5      2      5         4                        0
#> 5     3      2      1         4                        0
#> 6     2      2      5         1                        1

# Problem 3

# Use summarise() to create a summary table for survey respondents who are employed full time and are married. The table should have the mean and median for the importance of religion column.


#### Write your code below:

employ: Which of the following best describes your current employment status?
1 Full-time
2 Part-time
3 Temporarily laid off
4 Unemployed
5 Retired
6 Permanently disabled
7 Homemaker
8 Student
9 Other

marstat: What is your marital status?
1 Married
2 Separated
3 Divorced
4 Widowed
5 Never married
6 Domestic/civil partnership

pew_religimp: How important is religion in your life?
1 Very important
2 Somewhat important
3 Not too important
4 Not at all important

# Problem 3: keep respondents with employ == 1 (full-time) and marstat == 1
# (married), then report the mean and median of pew_religimp.
dat %>%
  filter(employ == 1, marstat == 1) %>%
  summarise(
    `Mean Importance of Religion` = mean(pew_religimp),
    `Median Importance of Religion` = median(pew_religimp)
  )

#> # A tibble: 1 x 2
#>   `Mean Importance of Religion` `Median Importance of Religion`
#>                           <dbl>                           <dbl>
#> 1                          2.19                               2