Chapter 5 Tidying Data
Hello! In this tutorial, we’ll be going over how I cleaned the original (or “raw”) public opinion dataset for the first assignment. Let’s first make sure to load the packages we’ll be using: plyr
and tidyverse
. plyr
is the precursor to dplyr
, the data manipulation package in tidyverse
(though dplyr
functions are faster and easier to use in most cases, there are a handful of situations where plyr
is still useful).
#install.packages("plyr")
library(plyr)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.2 --
## v ggplot2 3.3.6 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.9
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::arrange() masks plyr::arrange()
## x purrr::compact() masks plyr::compact()
## x dplyr::count() masks plyr::count()
## x dplyr::failwith() masks plyr::failwith()
## x dplyr::filter() masks stats::filter()
## x dplyr::id() masks plyr::id()
## x dplyr::lag() masks stats::lag()
## x dplyr::mutate() masks plyr::mutate()
## x dplyr::rename() masks plyr::rename()
## x dplyr::summarise() masks plyr::summarise()
## x dplyr::summarize() masks plyr::summarize()
# Read the raw survey export into a data frame. The path is relative, so the
# "data/" folder has to sit inside your working directory.
covid_po <- read.csv("data/survey_covid_31117650_dataset_raw.csv") #don't forget to make sure this file is in your working directory!
str(covid_po) # one line per column: its type and first few values
## 'data.frame': 1001 obs. of 88 variables:
## $ respo : int 52 84 106 115 149 194 306 316 336 429 ...
## $ project : chr "27340-1006" "27340-1006" "27340-1006" "27340-1006" ...
## $ samptype: chr "RDD" "RDD" "RDD" "RDD" ...
## $ date8 : int 20200812 20200812 20200812 20200812 20200812 20200814 20200814 20200812 20200812 20200813 ...
## $ night : chr "1st night" "1st night" "1st night" "1st night" ...
## $ nights : int 4 4 4 4 4 4 4 4 4 4 ...
## $ tele_7 : int 1657 1656 1703 1657 1715 2034 2023 2016 2016 1639 ...
## $ survlgth: num 11.07 9.44 15.39 7.52 20.33 ...
## $ tzone : chr "Eastern" "Eastern" "Eastern" "Eastern" ...
## $ reg4 : chr "Northeast (census div 1,2 )" "South (census div 5,6,7)" "Northeast (census div 1,2 )" "Northeast (census div 1,2 )" ...
## $ nreg4 : chr "Northeast" "South" "Northeast" "Northeast" ...
## $ censdiv : chr "Middle Atlantic (NJ, NY, PA)" "South Atlantic (DE, DC, FL, GA, MD, NC, SC, VA, WV)" "New England (CT, ME, MA, NH, RI, VT)" "New England (CT, ME, MA, NH, RI, VT)" ...
## $ ncensdiv: chr "Middle Atlantic (NJ, NY, PA)" "South Atlantic (DE, DC, FL, GA, MD, NC, SC, VA, WV)" "New England (CT, ME, MA, NH, RI, VT)" "New England (CT, ME, MA, NH, RI, VT)" ...
## $ abcnum : chr "New Jersey" "DC" "Connecticut" "Connecticut" ...
## $ stcode : chr "New Jersey" "DC" "Connecticut" "Connecticut" ...
## $ msaflag : chr "MSA" "MSA" "MSA" "MSA" ...
## $ cbsatype: chr "M" "M" "M" "M" ...
## $ usr : chr "Suburban (any portion of an MSA county that is not in a central city)" "Urban (central city of an MSA)" "Suburban (any portion of an MSA county that is not in a central city)" "Urban (central city of an MSA)" ...
## $ nusr : chr "S" "U" "U" "U" ...
## $ qsex : chr "Female" "Female" "Male" "Female" ...
## $ busflag : chr "Listed" "Listed" "Listed" "Listed" ...
## $ intrace : chr "White" "White" "White" "White" ...
## $ intgend : chr "Male" "Male" "Female" "Male" ...
## $ censusr : chr " " " " " " " " ...
## $ abcsurve: int 1215 1215 1215 1215 1215 1215 1215 1215 1215 1215 ...
## $ survtype: chr "National 1,000+" "National 1,000+" "National 1,000+" "National 1,000+" ...
## $ year : int 2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
## $ qs1 : chr " " " " " " " " ...
## $ q921 : chr "Female" "Female" "Male" "Male" ...
## $ q905 : chr "Yes" "Yes" "Yes" "Yes" ...
## $ q19 : chr "Approve, STRONGLY" "Disapprove, STRONGLY" "Disapprove, STRONGLY" "Disapprove, STRONGLY" ...
## $ q19net : chr "Approve NET" "Disapprove NET" "Disapprove NET" "Disapprove NET" ...
## $ q21_1 : chr "Severe" "Less than that" "Severe" "Very severe" ...
## $ q21_1net: chr "Severe NET" "Not Severe NET" "Severe NET" "Severe NET" ...
## $ q21_2 : chr "Moderate" "Less than that" "Less than that" "Very severe" ...
## $ q21_2net: chr "Not Severe NET" "Not Severe NET" "Not Severe NET" "Severe NET" ...
## $ q22_2 : chr "About the same" "Better" "Better" "Better" ...
## $ q23 : chr "Very worried" "Somewhat worried" "Somewhat worried" "Very worried" ...
## $ q23net : chr "More worried NET" "More worried NET" "More worried NET" "More worried NET" ...
## $ q24 : chr "Mostly under control" "Not at all under control" "Not at all under control" "Not at all under control" ...
## $ q24net : chr "Under control NET" "Not under control NET" "Not under control NET" "Not under control NET" ...
## $ q901 : chr "A Republican" "A Democrat " "A Democrat " "A Democrat " ...
## $ q901oe. : chr " " " " " " " " ...
## $ q904 : chr " " " " " " " " ...
## $ partlean: chr "a Republican" "a Democrat" "a Democrat" "a Democrat" ...
## $ q908a : chr "Moderate" "Moderate" "Moderate" "Moderate" ...
## $ q908b : chr " " " " " " " " ...
## $ q908c : chr " " " " " " " " ...
## $ ideo5 : chr "moderate" "moderate" "moderate" "moderate" ...
## $ q909 : chr "Some college (ASK IF TECHNICAL SCHOOL; IF YES, PUNCH CODE 3, FOR HIGH SCHOOL)" "Some college (ASK IF TECHNICAL SCHOOL; IF YES, PUNCH CODE 3, FOR HIGH SCHOOL)" "Graduated College" "Graduated high school" ...
## $ q909a : chr " " " " "Bachelors degree" " " ...
## $ edubreak: chr "some college +" "some college +" "some college +" "graduated high school" ...
## $ colleduc: chr "no college degree" "no college degree" "college degree" "no college degree" ...
## $ educnew : chr "some college" "some college" "college degree" "high school or less" ...
## $ q910 : chr "77" "72" "72" "49" ...
## $ q910a : chr " " " " " " " " ...
## $ agebreak: chr "65+" "65+" "65+" "40-49" ...
## $ q911 : chr "Catholic/Roman Catholic" "Baptist" "Baptist" "Atheist" ...
## $ q911sup : chr " " " " " " " " ...
## $ q911n : chr " " " " " " " " ...
## $ q911aa : chr " " " " " " " " ...
## $ q911aa. : chr " " " " " " " " ...
## $ q911a : chr " " " " " " " " ...
## $ q911b : chr "No" "No" "Yes" " " ...
## $ relnet : chr "Catholic" "Protestant" "Protestant" "none" ...
## $ q918 : chr "White" "Black" "Black" "White" ...
## $ q918x : chr " " " " " " " " ...
## $ wep : chr " " " " " " " " ...
## $ hisprace: chr "Non-Hispanic White" "Non-Hispanic Black" "Non-Hispanic Black" "Non-Hispanic White" ...
## $ whtsxcol: chr "White women without a college degree" " " " " "White men without a college degree" ...
## $ racenet : chr "white" "black" "black" "white" ...
## $ income : chr "100 thousand or more" "50 to under 75 thousand" "35 to under 50 thousand" "75 to under 100 thousand" ...
## $ income2 : chr "100 to under 150 thousand" " " " " " " ...
## $ q920a : chr "Yes" "Yes" "Yes" "Yes" ...
## $ qd1 : chr " " " " " " " " ...
## $ qd1a1 : chr "Gave zip code" "Gave zip code" "Gave zip code" "Gave zip code" ...
## $ l1 : chr "Yes, respondent or someone in household has cell phone" "Yes, respondent or someone in household has cell phone" "Yes, respondent or someone in household has cell phone" "Yes, respondent or someone in household has cell phone" ...
## $ c1 : chr " " " " " " " " ...
## $ phonstat: chr "Dual Service" "Dual Service" "Dual Service" "Dual Service" ...
## $ q924 : chr "female" "female" "male" "male" ...
## $ q924net : chr "Female" "Female" "Male" "Male" ...
## $ recontac: chr "OK to recontact" "OK to recontact" "OK to recontact" "OK to recontact" ...
## $ incent : chr " " " " " " " " ...
## $ qd2 : chr " " " " " " " " ...
## $ stcode2 : chr "NJ" "DC" "CT" "CT" ...
## $ lang : chr "English" "English" "English" "English" ...
## $ partycon: chr "Democrat-controlled districts" " " "Democrat-controlled districts" "Democrat-controlled districts" ...
## $ weight : num 0.849 0.75 0.37 1.524 0.625 ...
Some of these functions are a repeat of what you would have learned in the dplyr
tutorial.
5.1 Select
As you can see, the dataset has many (88) variables. Let’s focus on the ones we’re really interested in by using the select()
function in the dplyr
package. select()
takes the following structure: select(<data>, <some way of telling the computer what variables to select>)
. There are actually quite a few ways to do this (you can check them out in ?select
), but the one we’ll use here is the variable name, so the code will look like this: select(<data>, c(<variable1>, <variable2>, ...))
.
# Keep only the seven survey items used in the rest of this chapter.
covid_po_selected <- select(covid_po, c(q19, q21_1, q21_2, q23, q24, q901, educnew))
5.2 Filter
Sometimes, instead of selecting a specific set of columns, you want to filter by a condition in a row. You can do this using the filter()
function. The filter()
function uses the following structure: filter(<data>, <some logical condition involving a variable>)
. In order to use this function, we need to know the variable we’re interested in and how to filter what we want. For example, maybe you only want to look at female responses. In this example, we can use the intgend
variable to filter our data.
We can use table()
to learn more about this specific variable.
table(covid_po$intgend)
##
## Female Male
## 537 464
This result shows that we have 537 female-identifying participants and 464 male-identifying participants. So, when we use filter()
we should see 537 rows.
After the dataset, the second argument is a logical expression. Logical expressions use “relational operators” like >
(greater than) or <=
(less than or equal to). ==
means “equal to” and != means “not equal to”. The output of a logical expression is a logical value (TRUE or FALSE). So, if we want to filter by rows where the gender variable (intgend
) is “Female”, we would use the logical expression intgend == "Female"
. Let’s just see what happens when we use this logical expression on its own with the first intgend
variable in the data frame:
# Test the logical expression on the first element of intgend only.
covid_po$intgend[1] == "Female"
## [1] FALSE
Notice that this returns FALSE
. This means that the first participant is “Male”.
# Same first element, compared against the other category.
covid_po$intgend[1] == "Male"
## [1] TRUE
Let’s now put this all together:
# Keep only the rows where intgend is "Female", then count them.
covid_po_female <- filter(covid_po, intgend == "Female")
nrow(covid_po_female)
## [1] 537
As you can see when I use the nrow()
function (nrow()
returns the number of rows in a data frame), all the “Female” rows have been filtered into the new data frame, covid_po_female
.
5.3 Subset
You can also use subset()
in base R, which is a generic function for selecting values (or rows in a data frame) based on a criterion. The subset()
function also works on one-dimensional structures (vectors and lists).
While filter()
and select()
are both useful, subset()
remains quite popular because you can select columns and filter rows simultaneously. Aside from the dataset, subset()
takes two common arguments, a logical expression to indicate the variable you want to filter by and a select
argument in case you want to subset to a specific set of variables. Let’s see how this is used together:
#new_object <- subset(old_object, logical expression, variable selection)
# Rows: intgend is "Female". Columns: the eight variables named in select.
covid_subset <- subset(covid_po, intgend == "Female",
                       select = c(q19, q21_1, q21_2, q23, q24, q901, intgend, educnew))
One important thing to note is that the select
argument takes a vector of variable names built with c() (c(<variable 1>, <variable 2>, <...>)
).
Let’s now try this again with the stcode
variable, which is a state variable. The logical expression stcode == "Texas"
means that any row in which stcode
is “Texas” will be counted as TRUE
and therefore included in the dataset. Rows in which stcode
is not “Texas” (like the first 25 rows) will be counted as FALSE
and excluded from the subsetted dataset.
table(covid_po$stcode)
##
## Alabama Alaska Arizona Arkansas California
## 20 4 18 9 112
## Colorado Connecticut DC Delaware Florida
## 10 9 9 2 53
## Georgia Hawaii Idaho Illinois Indiana
## 35 2 16 40 20
## Iowa Kansas Kentucky Louisiana Maine
## 6 5 16 18 2
## Maryland Massachusetts Michigan Minnesota Mississippi
## 21 26 33 15 8
## Missouri Montana Nebraska Nevada New Hampshire
## 22 4 7 12 3
## New Jersey New Mexico New York North Carolina North Dakota
## 29 3 60 25 3
## Ohio Oklahoma Oregon Pennsylvania Rhode Island
## 38 6 20 49 5
## South Carolina South Dakota Tennessee Texas Utah
## 20 4 24 80 6
## Vermont Virginia Washington West Virginia Wisconsin
## 3 20 26 7 15
## Wyoming
## 1
# Rows: stcode is "Texas". Columns: the seven survey items of interest.
covid_texan_subset <- subset(covid_po, stcode == "Texas",
                             select = c(q19, q21_1, q21_2, q23, q24, q901, educnew))
#your covid_texan_subset should have 80 rows.
5.4 Recoding
Sometimes, you need to recode a variable and correct some information within it. This is especially common in survey data and other types of “dirty” data.
One way I check to see if a variable needs to be cleaned is to use the table()
function, to see what is in the variable. In addition to table()
I often use str()
to identify the class of the variable.
table(covid_po$q19) #trump handling covid
##
## (VOL) DK/No Opinion (VOL) NA/Refused Approve, SOMEWHAT
## 20 4 121
## Approve, STRONGLY Disapprove, SOMEWHAT Disapprove, STRONGLY
## 273 68 515
str(covid_po$q19)
## chr [1:1001] "Approve, STRONGLY" "Disapprove, STRONGLY" ...
table(covid_po_selected$q19) #trump handling covid
##
## (VOL) DK/No Opinion (VOL) NA/Refused Approve, SOMEWHAT
## 20 4 121
## Approve, STRONGLY Disapprove, SOMEWHAT Disapprove, STRONGLY
## 273 68 515
table(covid_po_selected$q21_1) #economic impact of covid
##
## (VOL) DK/No Opinion (VOL) NA/Refused Less than that Moderate
## 15 1 71 330
## Severe Very severe
## 376 208
table(covid_po_selected$q21_2) #economic impact of covid
##
## (VOL) DK/No Opinion (VOL) NA/Refused Less than that Moderate
## 13 3 412 345
## Severe Very severe
## 135 93
table(covid_po_selected$q23) #worried of family getting covid
##
## (VOL) DK/No opinion
## 1
## (VOL) Respondent or family member has caught it
## 50
## Not too worried
## 156
## Not worried at all
## 147
## Somewhat worried
## 359
## Very worried
## 288
table(covid_po_selected$q24) #is covid under control
##
## (VOL) DK/No opinion (VOL) NA/Refused Completely under control
## 17 3 31
## Mostly under control Not at all under control Somewhat under control
## 113 470 367
table(covid_po_selected$q901) #partisanship
##
## (VOL) DK/No Opinion (VOL) NA/Refused A Democrat A Republican
## 21 10 305 285
## An Independent Or what? (SPECIFY)
## 346 34
table(covid_po_selected$educnew) #education
##
## college degree high school or less
## 8 290 258
## post-graduate degree some college
## 182 263
Notice that when I use table()
, I find that many of the variables contain responses that mean NA
but are read as ordinary character values or factor levels (like (VOL) NA/Refused
). These need to be recoded so R can recognize them as missing values.
Recoding refers to the process of re-organizing the values of a variable so that it can be used for subsequent statistical analysis. Recoding is an essential (albeit tedious) process that varies from dataset to dataset. In this example (and with public opinion data generally), we often run into the problem of having to recode responses in words (e.g., “strongly agree”, “agree”, “disagree”, “strongly disagree”) into numbers.
To recode our data correctly, we’ll have to do a couple of things:
- Convert the variable into a factor
- Remove factor levels we won’t be using (like
(VOL) NA/Refused
).
- Re-organize the order of some factor variables’ levels (this is called releveling).
- Transform the factor variables into numeric variables.
In the following section, I’ll walk through each step and show different ways of doing them. Here, I’ll focus just on q19
, but you can try each method on different variables.
5.5 Converting Variable
If we use str()
, we’d notice that this variable is a character-type. However, we can use the as.factor()
function to coerce the character into a factor.
str(covid_po$q19)
## chr [1:1001] "Approve, STRONGLY" "Disapprove, STRONGLY" ...
# Overwrite the character column with its factor version, then re-inspect it.
covid_po$q19 <- as.factor(covid_po$q19)
str(covid_po$q19)
## Factor w/ 6 levels "(VOL) DK/No Opinion",..: 4 6 6 6 4 4 4 4 4 6 ...
Notice in the code above, I replace the old covid_po$q19
variable (which was the character) with the as.factor()
version. Sometimes, it is not a big deal to overwrite a variable. But sometimes it can be (the original values are lost), so make sure you watch out for this.
With factors, each possible option (like “Approve, STRONGLY”) is called a “level”. Often, there are levels we don’t use or need to convert to NA
. Let us proceed with this step now.
5.6 Removing Levels
There are a couple different ways we can remove the levels. First, we’ll learn the base R strategy using the function levels()
. levels()
tells you the levels of a factor variable. levels(covid_po$q19)
will return a vector of the level names. In order to use levels()
the variable/column/list/array needs to be a factor-type (it cannot be a character or numeric).
Building on top of this, if you write levels(covid_po$q19)=="(VOL) NA/Refused"
, it will return a logical vector in which only the second element is TRUE (because "(VOL) NA/Refused" is the second level). You can use this information to identify all rows where the covid_po$q19
variable is "(VOL) NA/Refused"
by placing it in brackets. For example, the following line of code replaces all rows which originally had the level "(VOL) NA/Refused"
into NA
(the R way of identifying a missing variable):
levels(covid_po$q19)[levels(covid_po$q19)=="(VOL) NA/Refused"] <- NA
The “spoken” way to interpret this code would be: in the brackets, identify all the rows in which q19
is "(VOL) NA/Refused"
. Then replace the variable levels of these rows (this is the information outside of the brackets) with NA
(hence the <- NA
at the end). Let’s see what this looks like in code:
# Base R approach: copy q19 into a new factor column, then assign NA to the
# names of the two non-answer levels so those responses become missing.
covid_po_selected$q19_base <- as.factor(covid_po_selected$q19) #first, we save our re-coded variable in a new variable
levels(covid_po_selected$q19_base)[levels(covid_po_selected$q19_base)=="(VOL) NA/Refused"] <- NA #removes this level
levels(covid_po_selected$q19_base)[levels(covid_po_selected$q19_base)=="(VOL) DK/No Opinion"] <- NA #removes this level
table(covid_po_selected$q19_base) #checks the variable
##
## Approve, SOMEWHAT Approve, STRONGLY Disapprove, SOMEWHAT
## 121 273 68
## Disapprove, STRONGLY
## 515
# plyr::count() is namespaced because dplyr::count() masks it.
plyr::count(covid_po_selected$q19_base) #checks the variable
## x freq
## 1 Approve, SOMEWHAT 121
## 2 Approve, STRONGLY 273
## 3 Disapprove, SOMEWHAT 68
## 4 Disapprove, STRONGLY 515
## 5 <NA> 24
You can use this same process to rename factor levels.
For example, the line levels(covid_po_selected$q19_base)[levels(covid_po_selected$q19_base)=="(VOL) NA/Refused"] <- "HELLO WORLD"
would replace (VOL) NA/Refused
with HELLO WORLD
.
Another method would be to use the plyr
package, which contains the revalue()
function. revalue()
is a pretty straight-forward function: for factor levels you want to change, include them into the second argument of revalue()
(the first argument would be the factor [or character] variable).
# plyr approach: revalue() takes a named vector of old value = new value pairs.
covid_po_selected$q19_revalue <- plyr::revalue(covid_po_selected$q19,
                                               c("(VOL) NA/Refused" = NA,
                                                 "(VOL) DK/No Opinion" = NA))
table(covid_po_selected$q19_revalue)
##
## Approve, SOMEWHAT Approve, STRONGLY Disapprove, SOMEWHAT
## 121 273 68
## Disapprove, STRONGLY
## 515
Notice that the results of this process should mirror the results of your base R method. If you want to revalue several levels (as I do above), your second argument needs to be a vector (c()
) that includes all the factor levels you want to change.
Finally, we come to the dplyr
strategy. There are two important functions we will use here: mutate()
and recode()
. With mutate()
, we can create or change variables in a data frame. With recode
, we can replace old factor levels with NA
–or, in this case, NA_character_
, which is a special variant of NA
for strings and character (keep in mind that the factor variables’ levels are names in characters, hence why we use NA_character_
).
Note here that I use dplyr::mutate()
(you may have noticed I also did this for revalue()
above). This is because there is a mutate()
function in both plyr
and dplyr
. To make sure R uses the correct function from the correct package, you can qualify (or elaborate) on the function with <package name>::
.
# dplyr approach: mutate() adds the column, recode() swaps the two non-answers
# for NA_character_.
covid_po_selected <- covid_po_selected %>% #I am going to take this data frame AND THEN...
  dplyr::mutate(q19_recode = dplyr::recode(q19, `(VOL) NA/Refused` = NA_character_, #I recode one variable...
                                           `(VOL) DK/No Opinion` = NA_character_)) #...and a 2nd variable
table(covid_po_selected$q19_recode)
##
## Approve, SOMEWHAT Approve, STRONGLY Disapprove, SOMEWHAT
## 121 273 68
## Disapprove, STRONGLY
## 515
By default (that is, if you don’t qualify the function with the package name), R will use the function from the last package that you loaded into the library. Therefore, if you ran library(plyr)
after library(dplyr)
, R will automatically use the plyr
functions. In this tutorial, I had you load library(tidyverse)
(which includes dplyr
) after library(plyr)
, and so the dplyr
functions are default (in this instance, dplyr::
is not necessary, since that is the current default given how I have loaded my packages, but it is something worth keeping in mind in the future).
You could just as easily use recode()
without mutate()
, but mutate()
allows you to do additional things with the dataset after you have mutated the individual variable, especially when used in tandem with pipes like below.
#without pipes
# Step 1: recode q19. Step 2: keep Republican rows. Step 3: tabulate.
covid_po_selected$q19_recode <- recode(covid_po_selected$q19,
                                       `(VOL) NA/Refused` = NA_character_,
                                       `(VOL) DK/No Opinion` = NA_character_)
covid_po_rep <- subset(covid_po_selected, q901 == "A Republican")
table(covid_po_rep$q19_recode)
##
## Approve, SOMEWHAT Approve, STRONGLY Disapprove, SOMEWHAT
## 60 169 11
## Disapprove, STRONGLY
## 41
#with pipes
# The same three steps as one chain; nothing is assigned, the table is printed.
covid_po_selected %>% #use this dataset
  mutate(q19_recode = recode(q19, `(VOL) NA/Refused` = NA_character_,
                             `(VOL) DK/No Opinion` = NA_character_)) %>% #and then mutate the q19 variable in this dataset
  subset(q901 == "A Republican", select = q19_recode) %>% #and then subset the dataset and select the variable of interest
  table() #and then construct a table
## q19_recode
## Approve, SOMEWHAT Approve, STRONGLY Disapprove, SOMEWHAT
## 60 169 11
## Disapprove, STRONGLY
## 41
5.7 Releveling
Now that we have eliminated the factor levels we don’t need, let’s move onto re-arranging the levels.
All factor variables have an implicit order that is primarily relevant in two situations. First, when constructing dummy variables, the first level of a factor becomes the default baseline. Second, when converting a factor variable into a numeric variable, R relies on the order of the levels to determine what number to assign to each level (so if the order of the levels is “strongly disagree”, “strongly agree”, “agree”, “disagree”, “strongly disagree” will be treated as a 1 and “strongly agree” as a 2 by default without re-leveling).
The best way to change the levels of factor variables is to use the forcats
package in R (forcats
is shorthand for “for cat[egorical variable]s”). Below, I’ll illustrate two popular functions, fct_rev()
, which reverses the level order, and fct_relevel()
which is a “manual” re-arrangement of levels.
table(covid_po_selected$q19_recode)
##
## Approve, SOMEWHAT Approve, STRONGLY Disapprove, SOMEWHAT
## 121 273 68
## Disapprove, STRONGLY
## 515
# fct_rev() flips the current level order; the result overwrites q19_recode.
covid_po_selected$q19_recode <- fct_rev(covid_po_selected$q19_recode)
table(covid_po_selected$q19_recode)
##
## Disapprove, STRONGLY Disapprove, SOMEWHAT Approve, STRONGLY
## 515 68 273
## Approve, SOMEWHAT
## 121
Note here that I saved over the original variable, so the original order is lost.
Reversing the order of factor levels is also do-able in base R using the factor()
function to isolate the factors and the rev()
function to reverse the order of the levels, as seen below:
table(covid_po_selected$q19_base)
##
## Approve, SOMEWHAT Approve, STRONGLY Disapprove, SOMEWHAT
## 121 273 68
## Disapprove, STRONGLY
## 515
# Base R equivalent of fct_rev(): rebuild the factor with its levels reversed.
covid_po_selected$q19_base <- factor(covid_po_selected$q19_base,
                                     levels = rev(levels(covid_po_selected$q19_base)))
table(covid_po_selected$q19_base)
##
## Disapprove, STRONGLY Disapprove, SOMEWHAT Approve, STRONGLY
## 515 68 273
## Approve, SOMEWHAT
## 121
In many instances, however, reversing the order of the levels may not be enough. Sometimes you have to re-arrange the levels completely. With fct_relevel()
the programmer has much more control over the order that the levels should be, but the programmer then needs to make this order explicit.
# Spell out the full level order, from strongest disapproval to strongest
# approval. Level names must match exactly, including capitalization.
covid_po_selected$q19_recode <- forcats::fct_relevel(covid_po_selected$q19_recode,
                                                     "Disapprove, STRONGLY",
                                                     "Disapprove, SOMEWHAT",
                                                     "Approve, SOMEWHAT",
                                                     "Approve, STRONGLY")
table(covid_po_selected$q19_recode)
##
## Disapprove, STRONGLY Disapprove, SOMEWHAT Approve, SOMEWHAT
## 515 68 121
## Approve, STRONGLY
## 273
Note that these levels (like all characters and strings in R) are case sensitive.
5.8 Coercing Data Types
Okay! Now that we’ve eliminated the factor levels that we are not interested in and now that the factor levels are in the right order, let’s turn this factor variable into a numeric one. To do this, we’ll use a function you should be familiar with: as.numeric()
, which coerces other value types into numerics.
table(covid_po_selected$q19_recode)
##
## Disapprove, STRONGLY Disapprove, SOMEWHAT Approve, SOMEWHAT
## 515 68 121
## Approve, STRONGLY
## 273
# as.numeric() on a factor returns each value's position in the level order.
covid_po_selected$q19_as_numeric <- as.numeric(covid_po_selected$q19_recode) #creates a new variable of q19 as a numeric
table(covid_po_selected$q19_as_numeric)
##
## 1 2 3 4
## 515 68 121 273
Notice that the order is the one you have releveled.
5.9 Combining These Steps
To repeat, this process is: remove unused factor levels, relevel the levels in the factor vector, and then coerce the factor vector into a numeric vector. In fact, you can do this in one “chunk” using pipes:
table(covid_po_selected$q19)
##
## (VOL) DK/No Opinion (VOL) NA/Refused Approve, SOMEWHAT
## 20 4 121
## Approve, STRONGLY Disapprove, SOMEWHAT Disapprove, STRONGLY
## 273 68 515
# One chain: drop the non-answers, put the levels in order, convert to numbers.
# The order is given explicitly with fct_relevel(). Simply reversing the
# alphabetical order would code "Approve, STRONGLY" as 3 and
# "Approve, SOMEWHAT" as 4, which is not a monotonic approval scale.
# NOTE: the head()/summary() printouts near the end of this chapter were
# produced with the reversed-alphabetical coding.
covid_po_selected$q19_numeric <- covid_po_selected$q19 %>% #takes the original q19 variable
  plyr::revalue(c("(VOL) NA/Refused" = NA, "(VOL) DK/No Opinion" = NA)) %>% #removes these levels
  forcats::fct_relevel("Disapprove, STRONGLY", "Disapprove, SOMEWHAT",
                       "Approve, SOMEWHAT", "Approve, STRONGLY") %>% #orders the levels from strongest disapproval to strongest approval
  as.numeric() #turns the factor into a numeric
table(covid_po_selected$q19_numeric)
##
## 1 2 3 4
## 515 68 121 273
As each variable can be different, it may be necessary to transform each variable individually, like in the following lines of code:
# q21_1 and q21_2 hold text labels. Calling as.numeric() directly on text
# returns NA for every row ("NAs introduced by coercion"), so the labels are
# first turned into an ordered factor and only then converted to 1-4.
# NOTE: the head()/summary() printouts near the end of this chapter were
# produced before this step was added and show NA for these two columns.
covid_po_selected$q21_1_numeric <- covid_po_selected$q21_1 %>%
  plyr::revalue(c("(VOL) NA/Refused" = NA, "(VOL) DK/No Opinion" = NA)) %>%
  forcats::fct_relevel("Less than that", "Moderate", "Severe", "Very severe") %>%
  as.numeric()
table(covid_po_selected$q21_1_numeric) #economic impact of covid 1
##
## 1 2 3 4
## 71 330 376 208
covid_po_selected$q21_2_numeric <- covid_po_selected$q21_2 %>%
  plyr::revalue(c("(VOL) NA/Refused" = NA, "(VOL) DK/No Opinion" = NA)) %>%
  forcats::fct_relevel("Less than that", "Moderate", "Severe", "Very severe") %>%
  as.numeric()
table(covid_po_selected$q21_2_numeric) #economic impact of covid 2
##
## 1 2 3 4
## 412 345 135 93
# q23: both volunteered answers become NA, then the four worry levels are
# ordered from least to most worried before converting to numbers.
covid_po_selected$q23_numeric <- covid_po_selected$q23 %>%
  plyr::revalue(c("(VOL) DK/No opinion" = NA,
                  "(VOL) Respondent or family member has caught it" = NA)) %>%
  forcats::fct_relevel("Not worried at all", "Not too worried",
                       "Somewhat worried", "Very worried") %>%
  as.numeric()
table(covid_po_selected$q23_numeric) #worried of the family getting covid
##
## 1 2 3 4
## 147 156 359 288
# q24: non-answers become NA, then the levels are ordered from
# "Not at all under control" (1) to "Completely under control" (4).
covid_po_selected$q24_numeric <- covid_po_selected$q24 %>%
  plyr::revalue(c("(VOL) NA/Refused" = NA, "(VOL) DK/No opinion" = NA)) %>%
  forcats::fct_relevel("Not at all under control",
                       "Somewhat under control",
                       "Mostly under control",
                       "Completely under control") %>%
  as.numeric()
# Check the new numeric column (not the raw q24), as for the other variables.
table(covid_po_selected$q24_numeric) #is covid under control?
##
## 1 2 3 4
## 470 367 113 31
# Clean the party-identification item and store it as a factor.
covid_po_selected$q901 <- covid_po_selected$q901 %>%
  trimws() %>% #the raw value "A Democrat " has a space at the end, so strip stray whitespace first
  plyr::revalue(c("(VOL) NA/Refused" = NA_character_, "(VOL) DK/No Opinion" = NA_character_,
                  "Or what? (SPECIFY)" = "Independent or Other Party",
                  "An Independent" = "Independent or Other Party")) %>% #turned these two factor levels into 1
  as.factor() #levels are now "A Democrat", "A Republican", "Independent or Other Party"
#You can also rename the levels of a factor by assigning levels(x) a character vector that is the same
#length as the number of levels. That only works on a factor: on a character vector (which q901 was
#before as.factor()) it leaves the values untouched, so it cannot be used to remove the trailing space.
covid_po_selected$partisanship <- covid_po_selected$q901 %>%
  forcats::fct_relevel("Independent or Other Party", "A Republican", "A Democrat")
table(covid_po_selected$partisanship) #partisanship
##
## Independent or Other Party A Republican
## 380 285
## A Democrat
## 305
table(covid_po_selected$educnew) #education
##
## college degree high school or less
## 8 290 258
## post-graduate degree some college
## 182 263
# educnew is a character vector, so it has no factor levels to edit yet.
# Blank answers are therefore set to NA on the values themselves.
# trimws() makes this work whether the blank is "" or one or more spaces.
# NOTE: printouts later in this chapter that still show an unlabeled category
# of 8 were produced before the blanks were converted to NA.
covid_po_selected$educnew[trimws(covid_po_selected$educnew) %in% ""] <- NA
covid_po_selected$educnew <- covid_po_selected$educnew %>%
  forcats::fct_relevel("high school or less", "some college",
                       "college degree", "post-graduate degree") #manual re-leveling
table(covid_po_selected$educnew) #education
##
## high school or less some college college degree
## 258 263 290
## post-graduate degree
## 182
Right now, the education variable (educnew
) still has four levels. But we can use the plyr::revalue()
function to change this information (earlier, we used revalue()
to set a level as NA
). Here, we use revalue()
to rename a factor level into something else–in this case, we combine all the college levels into “college” and rename “high school or less” to “no college”.
# Collapse the education levels: the three college levels share one new name,
# which merges them into a single level.
covid_po_selected$education <- covid_po_selected$educnew %>%
  plyr::revalue(c("high school or less" = "no college", "some college" = "college",
                  "college degree" = "college", "post-graduate degree" = "college")) #creates 2 levels from the original 4
table(covid_po_selected$education) #education
##
## no college college
## 258 735 8
Let’s now construct a data frame with just the variables we’ll be using.
# No row condition is given, so every row is kept; only columns are selected.
covid_po_relevant <- subset(covid_po_selected,
                            select = c(q19_numeric, q21_1_numeric, q21_2_numeric,
                                       q23_numeric, q24_numeric, partisanship, education))
Want to see what variables are in this data frame? Use colnames()
colnames(covid_po_relevant)
## [1] "q19_numeric" "q21_1_numeric" "q21_2_numeric" "q23_numeric"
## [5] "q24_numeric" "partisanship" "education"
All the variables we’re interested in are in there, which is great! But the variable names are not particularly descriptive. To fix this, let’s use the rename()
function in dplyr
.
# rename() takes new_name = old_name pairs. dplyr:: is spelled out because
# plyr also has a rename() function.
covid_po_relevant <- covid_po_relevant %>%
  dplyr::rename(q19_trump = q19_numeric,
                q21a_econ_community = q21_1_numeric, q21b_econ_personal = q21_2_numeric,
                q23_worried_family = q23_numeric, q24_covid_under_control = q24_numeric,
                partisanship = partisanship, education = education)
head(covid_po_relevant)
## q19_trump q21a_econ_community q21b_econ_personal q23_worried_family
## 1 3 NA NA 4
## 2 1 NA NA 3
## 3 1 NA NA 3
## 4 1 NA NA 4
## 5 3 NA NA 2
## 6 3 NA NA 1
## q24_covid_under_control partisanship education
## 1 3 A Republican college
## 2 1 A Democrat college
## 3 1 A Democrat college
## 4 1 A Democrat no college
## 5 2 A Republican college
## 6 4 A Republican no college
summary(covid_po_relevant)
## q19_trump q21a_econ_community q21b_econ_personal q23_worried_family
## Min. :1 Min. : NA Min. : NA Min. :1.000
## 1st Qu.:1 1st Qu.: NA 1st Qu.: NA 1st Qu.:2.000
## Median :1 Median : NA Median : NA Median :3.000
## Mean :2 Mean :NaN Mean :NaN Mean :2.829
## 3rd Qu.:3 3rd Qu.: NA 3rd Qu.: NA 3rd Qu.:4.000
## Max. :4 Max. : NA Max. : NA Max. :4.000
## NA's :24 NA's :1001 NA's :1001 NA's :51
## q24_covid_under_control partisanship education
## Min. :1.000 Independent or Other Party:380 no college:258
## 1st Qu.:1.000 A Republican :285 college :735
## Median :2.000 A Democrat :305 : 8
## Mean :1.699 NA's : 31
## 3rd Qu.:2.000
## Max. :4.000
## NA's :20
Awesome! You can now save this file using write.csv()
to save the data frame as a csv, or save()
to save the data as a .Rdata file.
# Save the cleaned data. row.names = FALSE keeps R's row numbers out of the
# file; otherwise they are written as an extra unnamed first column that comes
# back as a stray variable when the csv is read in again.
write.csv(covid_po_relevant, "j381m_covid_survey_clean.csv", row.names = FALSE)
As I mentioned, each dataset will come with its own data cleaning issues–no two datasets are alike in their problems. However, the more datasets you work with, the better you will get at devising a plan for cleaning the data. Like many research endeavors, it is easiest to approach data cleaning in steps as we have in this tutorial.