Chapter 5 Descriptive statistics
In this chapter, you will learn how to:
- Summarize categorical variables,
- Summarize numeric variables,
- Summarize variables by levels of another variable,
- Create a “Table 1” of descriptive statistics.
We will use a subset of 250 adults from NHANES 2013-2014 (Section 1.20.2) for this chapter.
library(tidyverse)
# Restore the saved object from disk; the file contains the dataset mydat
load(file = "Data/nhanes_subset.RData")
The code below was used to create this dataset. This provides you with examples of some of the techniques we discussed in the previous chapters, along with some new functions (`set.seed()` and `sample()`). `set.seed()` ensures that the same random numbers are generated each time you run the code. In this case, the same random subset of 250 observations from NHANES will be selected.
NOTE: You do not actually need to run the code below if you already have `nhanes_subset.RData` (from your instructor or learning management system).
library(tidyverse)
# Import NHANES data
library(nhanesA)
# Download five NHANES 2013-2014 tables, keep only the variables needed for
# this chapter, and merge them by participant ID (SEQN) into mydat0.
#
# NOTE(review): the recoding later in this script treats RIAGENDR, RIDRETH1,
# and INDHHIN2 as raw numeric codes (e.g., levels = 1:2). Confirm that your
# version of nhanesA returns untranslated codes from nhanes(); if it returns
# text labels instead, those recodes will produce NA values.

# Demographics: https://wwwn.cdc.gov/nchs/nhanes/2013-2014/demo_h.htm
demo <- nhanes("DEMO_H") %>%
  select(SEQN, RIAGENDR, RIDAGEYR, RIDRETH1, INDHHIN2)

# Body Measures: https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/BMX_H.htm
bmx <- nhanes("BMX_H") %>%
  select(SEQN, BMXBMI, BMXWAIST)

# Cholesterol - Total: https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/TCHOL_H.htm
tchol <- nhanes("TCHOL_H") %>%
  select(SEQN, LBXTC)

# Cholesterol - Low-Density Lipoproteins (LDL) & Triglycerides:
# https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/TRIGLY_H.htm
trigly <- nhanes("TRIGLY_H") %>%
  select(SEQN, LBXTR)

# Plasma Fasting Glucose: https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/GLU_H.htm
glu <- nhanes("GLU_H") %>%
  select(SEQN, LBXGLU)

# Merge the tables. SEQN is the only column the tables share, so naming it
# explicitly gives the same result as the default natural join while making
# the key visible (and avoiding the "Joining with" message).
mydat0 <- demo %>%
  left_join(bmx, by = "SEQN") %>%
  left_join(tchol, by = "SEQN") %>%
  left_join(trigly, by = "SEQN") %>%
  # Right join at last step to limit to fasting subsample
  right_join(glu, by = "SEQN")

# Inspect the merged data
names(mydat0)
summary(mydat0)

# Dimension before subsetting
dim(mydat0)
# Process the dataset: restrict to adults, convert coded variables to
# labeled factors, collapse household income, and rename variables.
mydat <- mydat0 %>%
  # Adults only
  filter(RIDAGEYR >= 18) %>%
  # Convert to factor
  mutate(gender = factor(RIAGENDR,
                         levels = 1:2,
                         labels = c("Male", "Female"))) %>%
  mutate(race = factor(RIDRETH1,
                       levels = 1:5,
                       labels = c("Mexican American",
                                  "Other Hispanic",
                                  "Non-Hispanic White",
                                  "Non-Hispanic Black",
                                  "Other"))) %>%
  # Collapse household income into a 3-level variable.
  # Any INDHHIN2 value not listed below (including NA) matches no
  # condition, so case_when() sets income to NA for those rows.
  mutate(income = case_when(INDHHIN2 %in% c(1:5, 13)        ~ 1,
                            INDHHIN2 %in% 6:8               ~ 2,
                            INDHHIN2 %in% c(9, 10, 14, 15)  ~ 3)) %>%
  # Convert to factor
  mutate(income = factor(income,
                         levels = 1:3,
                         labels = c("< $25,000", "$25,000 to < $55,000", "$55,000+"))) %>%
  # Rename variables for clarity (new_name = old_name)
  rename(id      = SEQN,
         age     = RIDAGEYR,
         bmi     = BMXBMI,
         waist   = BMXWAIST,
         choles  = LBXTC,
         trigly  = LBXTR,
         glucose = LBXGLU)

# Check derivations: cross-tabulate each original variable against its
# derived version. exclude = NULL keeps NA as a category so that missing
# values are visible in the tables.
table(mydat$RIAGENDR, mydat$gender, exclude = NULL)
table(mydat$RIDRETH1, mydat$race,   exclude = NULL)
table(mydat$INDHHIN2, mydat$income, exclude = NULL)
# Select a subset of variables
mydat <- mydat %>%
  select(id, gender, race, income, age, choles, bmi, waist, glucose, trigly)
# NOTE: We could have included this select() statement at the end of the
#       previous pipe, but then we would not have been able to check
#       the derivations. That is why I included it as a separate step
#       after the checking.

# Set random seed so sample() gives the same result every time.
# (select() above does not use random numbers, so setting the seed here,
# immediately before sample(), gives the same subset.)
set.seed(3489)

# Select random set of IDs (250 IDs drawn without replacement)
SUBSET <- sample(mydat$id, 250)

# Keep only the rows whose id was sampled
mydat <- mydat %>%
  filter(id %in% SUBSET)

# Check final data
dim(mydat)
summary(mydat)

# Save the final dataset (the object mydat) for use in this chapter
save(mydat, file = "Data/nhanes_subset.RData")