Chapter 5 Descriptive statistics

In this chapter, you will learn how to:

  • Summarize categorical variables,
  • Summarize numeric variables,
  • Summarize variables by levels of another variable,
  • Create a “Table 1” of descriptive statistics.

We will use a subset of 250 adults from NHANES 2013-2014 (Section 1.20.2) for this chapter.

library(tidyverse)
# Restore the objects stored in the .RData file into the current workspace
load("Data/nhanes_subset.RData")
# The dataset is called mydat

The code below was used to create this dataset. This provides you with examples of some of the techniques we discussed in the previous chapters, along with some new functions (set.seed() and sample()). set.seed() ensures that the same random numbers are generated each time you run the code. In this case, the same random subset of 250 observations from NHANES will be selected.

NOTE: You do not actually need to run the code below if you already have nhanes_subset.RData (from your instructor or learning management system).

library(tidyverse)

# Import NHANES data
library(nhanesA)

# Download each NHANES 2013-2014 table and keep only the participant ID
# (SEQN) plus the variables needed for this chapter.
# NOTE(review): the recoding later in this script treats RIAGENDR, RIDRETH1
#               and INDHHIN2 as numeric codes. Whether nhanes() returns raw
#               codes or translated text labels by default may depend on the
#               installed nhanesA version -- TODO confirm.

# Demographics
# https://wwwn.cdc.gov/nchs/nhanes/2013-2014/demo_h.htm
demo <- select(nhanes("DEMO_H"), SEQN, RIAGENDR, RIDAGEYR, RIDRETH1, INDHHIN2)

# Body Measures
# https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/BMX_H.htm
bmx <- select(nhanes("BMX_H"), SEQN, BMXBMI, BMXWAIST)

# Cholesterol - Total
# https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/TCHOL_H.htm
tchol <- select(nhanes("TCHOL_H"), SEQN, LBXTC)

# Cholesterol - Low-Density Lipoproteins (LDL) & Triglycerides
# https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/TRIGLY_H.htm
trigly <- select(nhanes("TRIGLY_H"), SEQN, LBXTR)

# Plasma Fasting Glucose
# https://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/GLU_H.htm
glu <- select(nhanes("GLU_H"), SEQN, LBXGLU)

# Merge the five tables on the participant ID (SEQN).
# The join key is stated explicitly: SEQN is the only column the tables have
# in common after the select() calls above, so the result is unchanged, but
# naming it avoids the "Joining with `by = ...`" messages and prevents an
# accidental join on any other column that might later be selected from more
# than one table.
mydat0 <- demo %>%
  left_join(bmx,    by = "SEQN") %>%
  left_join(tchol,  by = "SEQN") %>%
  left_join(trigly, by = "SEQN") %>%
  # Right join at last step to limit to fasting subsample
  # (right_join() keeps every row of glu)
  right_join(glu,   by = "SEQN")

# Inspect the merged data: variable names and per-variable summaries
names(mydat0)
summary(mydat0)

# Dimension before subsetting
dim(mydat0)

# Build the analysis dataset: restrict to adults, derive labelled factors
# from the coded NHANES variables, and give the measurements readable names
mydat <- mydat0 %>%
  # Keep participants aged 18 years or older
  filter(RIDAGEYR >= 18) %>%
  mutate(
    # Coded variables become factors with descriptive labels
    gender = factor(RIAGENDR,
                    levels = 1:2,
                    labels = c("Male", "Female")),
    race = factor(RIDRETH1,
                  levels = 1:5,
                  labels = c("Mexican American",
                             "Other Hispanic",
                             "Non-Hispanic White",
                             "Non-Hispanic Black",
                             "Other")),
    # Household income is collapsed to three levels and converted to a
    # factor in one step; codes not listed below end up as NA
    income = factor(
      case_when(
        INDHHIN2 %in% c(1:5, 13)       ~ 1,
        INDHHIN2 %in% 6:8              ~ 2,
        INDHHIN2 %in% c(9, 10, 14, 15) ~ 3
      ),
      levels = 1:3,
      labels = c("< $25,000", "$25,000 to < $55,000", "$55,000+")
    )
  ) %>%
  # Replace the NHANES variable names with clearer ones
  rename(
    id      = SEQN,
    age     = RIDAGEYR,
    bmi     = BMXBMI,
    waist   = BMXWAIST,
    choles  = LBXTC,
    trigly  = LBXTR,
    glucose = LBXGLU
  )

# Check derivations: cross-tabulate each original NHANES variable (rows)
# against the variable derived from it (columns). exclude = NULL keeps
# missing values in the tables, so original values that were not mapped to
# any level of the derived variable show up under NA instead of being dropped.
table(mydat$RIAGENDR, mydat$gender, exclude = NULL)
table(mydat$RIDRETH1, mydat$race,   exclude = NULL)
# For income, only the codes listed in the case_when() recode are assigned
# a level; any other INDHHIN2 value appears in the NA column.
table(mydat$INDHHIN2, mydat$income, exclude = NULL)

# Keep only the analysis variables
mydat <- select(mydat,
                id, gender, race, income, age,
                choles, bmi, waist, glucose, trigly)
# NOTE: This select() could have gone at the end of the processing pipe,
#       but the original NHANES variables would then have been dropped
#       before the derivations were checked against them. It is therefore
#       done as a separate step after the checking.

# Fix the random seed so that sample() draws the same IDs on every run
set.seed(3489)

# Draw 250 IDs at random, then keep only the rows with those IDs
SUBSET <- sample(mydat$id, 250)
mydat <- filter(mydat, id %in% SUBSET)

# Check final data
dim(mydat)
summary(mydat)

# Write the final dataset to disk
save(mydat, file = "Data/nhanes_subset.RData")