Chapter 17 Data Cleaning/Wrangling
`kmeans()` only takes numeric variables, so let us now prepare our variables for this. First, let’s rename our variables so they make more sense.
# Give the first three survey columns descriptive names in a single assignment.
colnames(survey_data)[1:3] <- c("issue_econ", "issue_race", "issue_covid")
Next, let’s remove the levels we’re not interested in (like “Refused”) and then change these variables into a numeric scale.
# Recode the non-substantive answers (blank, "Refused", "Don't know") as NA.
# Assigning NA to a factor level drops the level and marks its rows as missing.
levels(survey_data$issue_econ)[
  levels(survey_data$issue_econ) %in% c(" ", "Refused", "Don't know")
] <- NA
# Tabulate the remaining categories to check the recode.
plyr::count(survey_data$issue_econ)
## x freq
## 1 Not at all important 16
## 2 Not too important 27
## 3 Somewhat important 257
## 4 Very important 746
## 5 <NA> 161
levels(survey_data$issue_econ)
## [1] "Not at all important" "Not too important" "Somewhat important"
## [4] "Very important"
# Replace the factor with its integer level codes (1 = "Not at all important",
# ..., 4 = "Very important") because kmeans() only accepts numeric input.
survey_data$issue_econ <- as.numeric(survey_data$issue_econ)
# Recode the non-substantive answers (blank, "Refused", "Don't know") as NA.
# Assigning NA to a factor level drops the level and marks its rows as missing.
levels(survey_data$issue_race)[
  levels(survey_data$issue_race) %in% c(" ", "Refused", "Don't know")
] <- NA
# Tabulate the remaining categories to check the recode.
plyr::count(survey_data$issue_race)
## x freq
## 1 Not at all important 114
## 2 Not too important 85
## 3 Somewhat important 262
## 4 Very important 570
## 5 <NA> 176
# Replace the factor with its integer level codes (in the level order shown
# above) because kmeans() only accepts numeric input.
survey_data$issue_race <- as.numeric(survey_data$issue_race)
# Recode the non-substantive answers (blank, "Refused", "Don't know") as NA.
# Assigning NA to a factor level drops the level and marks its rows as missing.
levels(survey_data$issue_covid)[
  levels(survey_data$issue_covid) %in% c(" ", "Refused", "Don't know")
] <- NA
# Tabulate the remaining categories to check the recode.
plyr::count(survey_data$issue_covid)
## x freq
## 1 Not at all important 115
## 2 Not too important 87
## 3 Somewhat important 220
## 4 Very important 619
## 5 <NA> 166
# Confirm the column is still a factor before converting it.
class(survey_data$issue_covid)
## [1] "factor"
# Replace the factor with its integer level codes (in the level order shown
# above) because kmeans() only accepts numeric input.
survey_data$issue_covid <- as.numeric(survey_data$issue_covid)
# Recode the non-substantive answers as NA.
# NOTE(review): "Don't Know" is capitalised differently here than in the
# issue_* questions ("Don't know") -- confirm it matches the level in the data.
# Any value not listed in `levels` below becomes NA in the factor() call anyway.
levels(survey_data$trumpapprove)[
  levels(survey_data$trumpapprove) %in% c("Refused", "Don't Know")
] <- NA
# Set the level order explicitly so the numeric codes run from
# 1 = "Strongly disapprove" up to 4 = "Strongly approve".
survey_data$trumpapprove <- factor(
  survey_data$trumpapprove,
  levels = c("Strongly disapprove", "Somewhat disapprove",
             "Somewhat approve", "Strongly approve")
)
plyr::count(survey_data$trumpapprove) # reordering needed
## x freq
## 1 Strongly disapprove 562
## 2 Somewhat disapprove 86
## 3 Somewhat approve 140
## 4 Strongly approve 389
## 5 <NA> 30
# Replace the factor with its integer level codes because kmeans() only
# accepts numeric input.
survey_data$trumpapprove <- as.numeric(survey_data$trumpapprove)
Importantly, k-means requires complete observations to work. In other words, if any of your values are `NA`, then `kmeans()` will not work. We can remove observations with an `NA` using `na.omit()`.
# Drop every row that has an NA in any column of survey_data, so that only
# complete observations are left for kmeans().
survey_data <- na.omit(survey_data)