# Chapter 3 Working with Data Frames

## 3.1 subset and filter

### 3.1.1 by variable (columns)

• DF <- filter(DF, VARIABLE == "CONDITION") filters based on a condition
• DF <- DF[c("VARIABLE1", "VARIABLE2", "…")] filters based on variable names
• DF <- subset(DF, select=c("VARIABLE1", "VARIABLE2", "…")) is another option to filters based on variables names, requires dplyerpackage

• DF <- DF[DF$VARIABLE < 100,] with the < 100 as an example, works with any other R operator ### 3.1.3 by the number of missing values • useful when analyzing sets of items: • DF$na_count <- apply(is.na(DF), 1, sum) creates the variable na_count with the number of NAs
• DF_noNAs <- DF[DF$na_count < 100,] creates a new DF_noNAs based on the NAs counts (in the example > 100) • follows code chunk: DF$na_count <- apply(is.na(DF), 1, sum) # counts the NAs in the DF
DF <- DF[DF$na_count < 100,] # filters based on NAs counts (in the example > 100) ## 3.2 merge • DF1&2 <- full_join(DF1, DF2, by = "GROUPING_VARIABLE", suffix = c(".x", ".y")) merges all variables of DF1 and DF2: • adds missing cases on empty cells • adds a .x and .y suffix to variables from DF1 or DF2 that did not match • dplyr package is required ## 3.3 transpose • DF <- as.data.frame(t(as.matrix(DF))) transposes an entire data frame ## 3.4 re-name variables • names(DF)[names(DF) == "OLD_NAME"] <- "NEW_NAME" for a single variable • names(DF)[1] <- "NEW_NAME" where the number corresponds to the column number • for several variables simultaneously: • DF.labels <- c("OLD_NAME"="NEW_NAME", … ) creates the labels’ key • names(DF) <- as.list(DF.labels[match(names(DF), names(DF.labels))]) applies the labels’ key • follows code chunk: DF.labels <- c("OLD_NAME"="NEW_NAME", … ) # creates labels names(DF) <- as.list(DF.labels[match(names(DF), names(DF.labels))]) # applies labels • for several variables simultaneously using names in a data frame (recommended): • DF <- DF %>% rename_with(~ DF.names$newname, DF.names$oldname) with DF.names being the data frame with the names (this data frame can be the study varaible dictionary) ## 3.5 re-code variables ### 3.5.1 categorize a continuous variable • example with 3 categories DF$FACTOR_VAR <- cut(DF$CONTINUOUS_VAR,breaks=c(-Inf, x, y, Inf), labels=c(">x","x-y",">y")) with $$x$$ and $$y$$ referring to the numeric values of the continuous variable ### 3.5.2 create dummies 1. DF <- dummy_cols(DF, select_columns = "VARAIBLE") uses the dummy_cols() to create the dummies in a DF for a factor VARIABLE 2. names(DF)[names(DF) == "VARIABLE_DUMMY1"] <- "VARIABLE" is useful to rename on of the dummy variables for latter use 3. DF$VARIABLE <- NULL can be used to clean the unnecessary VARIABLES in the the DF, follows code chunk:
DF <- dummy_cols(DF, select_columns = "VARAIBLE") # create variables
names(DF)[names(DF) == "VARIABLE_DUMMY1"] <- "VARIABLE"
DF$VARIABLE <- NULL ### 3.5.3 label factors • DF$VARIABLE <- factor(DF$VARIABLE, levels=c(1,2,3), labels=c("LABEL_1", "LABEL_2", "LABEL_3")) turns variable into factor with corresponding levels and labels ### 3.5.4 replace values • DF[DF==""] <- NA replace missing values by NA on a entire data frame (same logic applies to any other specific values) • DF$VARIABLE[DF$VARIABLE==""] <- NA allows to replace values on specific variables • DF[is.na(DF)] = 0 allows to replace NAs by specific values (in the example, by 0) • DF$VARIABLE[is.na(DF\$VARIABLE)] = 0 same logic as previous but for specific variables
• replace all missing values by mean using:
for (cols in colnames(DF)) {
if (cols %in% names(DF[,sapply(DF, is.numeric)])) {
DF <-DF %>% mutate(!!cols := replace(!!rlang::sym(cols),
is.na(!!rlang::sym(cols)),
mean(!!rlang::sym(cols), na.rm=TRUE)))
}
else {
DF <- DF  %>% mutate(!!cols := replace(!!rlang::sym(cols), !!rlang::sym(cols)=="",
getmode(!!rlang::sym(cols))))
}
}