12 Additional Feature Engineering

12.1 Demographic Features

library('tidyverse')
#> -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
#> v ggplot2 3.3.6     v purrr   0.3.4
#> v tibble  3.1.7     v dplyr   1.0.9
#> v tidyr   1.2.0     v stringr 1.4.0
#> v readr   2.1.2     v forcats 0.5.1
#> -- Conflicts ------------------------------------------ tidyverse_conflicts() --
#> x dplyr::filter() masks stats::filter()
#> x dplyr::lag()    masks stats::lag()

source('Functions/connect_help.R')
#> Warning: `src_sqlite()` was deprecated in dplyr 1.0.0.
#> Please use `tbl()` directly with a database connection
#> This warning is displayed once every 8 hours.
#> Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

NHANES_Tables
#>  [1] "ACQ"      "ALB_CR"   "ALQ"      "ALQY"     "AUQ"      "BIOPRO"  
#>  [7] "BMX"      "BPQ"      "BPX"      "CBC"      "CDQ"      "CMV"     
#> [13] "COT"      "CRCO"     "DBQ"      "DEMO"     "DEQ"      "DIQ"     
#> [19] "DLQ"      "DPQ"      "DR1IFF"   "DR1TOT"   "DR2IFF"   "DR2TOT"  
#> [25] "DRXFCD"   "DS1IDS"   "DS1TOT"   "DS2TOT"   "DSBI"     "DSII"    
#> [31] "DSQIDS"   "DSQTOT"   "DUQ"      "DXX"      "DXXFEM"   "DXXSPN"  
#> [37] "Ds2IDS"   "ECQ"      "FASTQX"   "FERTIN"   "FETIB"    "FOLATE"  
#> [43] "FOLFMS"   "GHB"      "GLU"      "HDL"      "HEPA"     "HEPBD"   
#> [49] "HEPC"     "HEPE"     "HEQ"      "HIQ"      "HIV"      "HOQ"     
#> [55] "HSCRP"    "HSQ"      "HSV"      "HUQ"      "IHGEM"    "IMQ"     
#> [61] "INS"      "KIQ"      "LUX"      "MCQ"      "OCQ"      "OHQ"     
#> [67] "OHXDEN"   "OHXREF"   "OSQ"      "PAQ"      "PAQY"     "PBCD"    
#> [73] "PFAS"     "PFQ"      "PUQMEC"   "RHQ"      "RXQASA"   "RXQ_DRUG"
#> [79] "RXQ_RX"   "SLQ"      "SMQ"      "SMQFAM"   "SMQRTU"   "SMQSHS"  
#> [85] "SSPFAS"   "SXQ"      "TCHOL"    "TFR"      "TRIGLY"   "UCFLOW"  
#> [91] "UCM"      "UCPREG"   "UHG"      "UIO"      "UNI"      "VIC"     
#> [97] "VOCWB"    "VTQ"

\(~\)


\(~\)

# Open_NHANES_table_help('DEMO')

12.1.1 Age & Gender

AGE_GENDER_TBL <- DEMO %>%
  mutate(Gender = if_else(RIAGENDR == 2 , "Female", "Male")) %>%
  mutate(Age = if_else(is.na(RIDAGEYR), RIDAGEMN/12 , RIDAGEYR)) %>%
  select(SEQN, Age, Gender) 

AGE_GENDER_TBL %>%
  glimpse()
#> Rows: ??
#> Columns: 3
#> Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#> $ SEQN   <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, ~
#> $ Age    <dbl> 2, 77, 10, 1, 49, 19, 59, 13, 11, 43, 15, 37, 70, 81, 38, 85, 2~
#> $ Gender <chr> "Female", "Male", "Female", "Male", "Male", "Female", "Female",~

12.1.2 Race/Hispanic origin

We see that only RIDRETH1 is available in both 2017-2018 and 1999-2000:

RACE_TBL <- DEMO %>%
  select(SEQN, RIDRETH1) %>%
  mutate(Race = 
           case_when(RIDRETH1 == 1 ~ "Mexican American",
                     RIDRETH1 == 2 ~ "Other Hispanic",
                     RIDRETH1 == 3 ~ "White",
                     RIDRETH1 == 4 ~ "Black",
                     RIDRETH1 == 5 ~ "Other",
                     !(RIDRETH1 %in% c(1,2,3,4,5)) ~ NA )) %>%
  select(-RIDRETH1)

RACE_TBL %>% 
  head()
#> # Source:   SQL [6 x 2]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>    SEQN Race 
#>   <dbl> <chr>
#> 1     1 Black
#> 2     2 White
#> 3     3 White
#> 4     4 Black
#> 5     5 White
#> 6     6 Other

12.1.3 US Armed Forces

We see that only DMQMILIZ is available since 2011-2012, prior to 2011, the veteran status information (released in the variable DMQMILIT in the demographics file in 1999-2010)

USFA_TBL <- DEMO %>%
  select(SEQN, DMQMILIZ, DMQMILIT) %>%
  mutate(USAF = 
           case_when((DMQMILIZ == 1 | DMQMILIT == 1) ~ "Yes",
                     (DMQMILIZ == 2 | DMQMILIT == 2) ~ "No",
                     !(DMQMILIZ %in% c(1,2) | DMQMILIT %in% c(1,2) ) |(is.na(DMQMILIZ) & is.na(DMQMILIT))   ~ NA )) %>%
  select(-DMQMILIZ, -DMQMILIT)

USFA_TBL %>% 
  head()
#> # Source:   SQL [6 x 2]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>    SEQN USAF 
#>   <dbl> <chr>
#> 1     1 <NA> 
#> 2     2 Yes  
#> 3     3 <NA> 
#> 4     4 <NA> 
#> 5     5 Yes  
#> 6     6 No

12.1.4 Country of Birth

Due to the concerns of disclosure risk, starting in 2011, country of birth was recoded into two categories: 1) Born in 50 U.S. states or Washington, DC; and 2) Born in other countries, including U.S. territories.

BIRTH_COUNTRY_TBL <- DEMO %>%
  select(SEQN, DMDBORN, DMDBORN2, DMDBORN4 ) %>%
  mutate(Birth_Country = 
           case_when(DMDBORN == 1 | DMDBORN2 == 1 | DMDBORN4 == 1 ~ "USA",
                     DMDBORN == 2 | DMDBORN2 == 2 | DMDBORN4 == 2 ~ "Other",
                     (DMDBORN > 2 | DMDBORN2 > 2 | DMDBORN4 > 2 ) | 
                       (is.na(DMDBORN) & is.na(DMDBORN2) & is.na(DMDBORN4)) ~ NA  )) %>%
  select(SEQN, Birth_Country)

BIRTH_COUNTRY_TBL %>% 
  head()
#> # Source:   SQL [6 x 2]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>    SEQN Birth_Country
#>   <dbl> <chr>        
#> 1     1 USA          
#> 2     2 USA          
#> 3     3 <NA>         
#> 4     4 USA          
#> 5     5 USA          
#> 6     6 USA

12.1.5 Education

EDUCATION_TBL <- DEMO %>%
  select(SEQN, DMDEDUC3, DMDEDUC2) %>%
  mutate(Grade_Level = 
           case_when(DMDEDUC3 ==0~"Never attended / kindergarten only",
DMDEDUC3 ==1~"1st grade",
DMDEDUC3 ==2~"2nd grade",
DMDEDUC3 ==3~"3rd grade",
DMDEDUC3 ==4~"4th grade",
DMDEDUC3 ==5~"5th grade",
DMDEDUC3 ==6~"6th grade",
DMDEDUC3 ==7~"7th grade",
DMDEDUC3 ==8~"8th grade",
DMDEDUC3 ==9~"9th grade",
DMDEDUC3 ==10~"10th grade",
DMDEDUC3 ==11~"11th grade",
DMDEDUC3 ==12~"12th grade, no diploma",
DMDEDUC3 ==13~"High school graduate",
DMDEDUC3 ==14~"GED or equivalent",
DMDEDUC3 ==15~"More than high school",
DMDEDUC3 ==55~"Less than 5th grade",
DMDEDUC3 ==66~"Less than 9th grade",
DMDEDUC3 ==77~"Refused",
DMDEDUC3 ==99~"Don't Know",
is.na(DMDEDUC3) |  DMDEDUC3 > 66 ~ NA)) %>%
  select(-DMDEDUC3) %>%
  mutate(Grade_Range = 
           case_when(DMDEDUC2 ==1~"Less than 9th grade",
DMDEDUC2 ==2~"9-11th grade (Includes 12th grade with no diploma)",
DMDEDUC2 ==3~"High school graduate/GED or equivalent",
DMDEDUC2 ==4~"Some college or AA degree",
DMDEDUC2 ==5~"College graduate or above",
DMDEDUC2 > 5 | is.na(DMDEDUC2)  ~ NA )) %>%
  select(-DMDEDUC2)

EDUCATION_TBL %>% 
  head()
#> # Source:   SQL [6 x 3]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>    SEQN Grade_Level           Grade_Range              
#>   <dbl> <chr>                 <chr>                    
#> 1     1 <NA>                  <NA>                     
#> 2     2 <NA>                  College graduate or above
#> 3     3 3rd grade             <NA>                     
#> 4     4 <NA>                  <NA>                     
#> 5     5 <NA>                  College graduate or above
#> 6     6 More than high school <NA>

12.1.6 Marital Status


MARITAL_STATUS_TBL <- DEMO %>%
  select(SEQN, DMDMARTL) %>%
  mutate(Marital_Status = 
           case_when(DMDMARTL ==1~"Married",
DMDMARTL ==2~"Widowed",
DMDMARTL ==3~"Divorced",
DMDMARTL ==4~"Separated",
DMDMARTL ==5~"Never married",
DMDMARTL ==6~"Living with partner",
DMDMARTL >6 | is.na(DMDMARTL) ~ NA )) %>%
  select(-DMDMARTL)

MARITAL_STATUS_TBL %>% 
  head()
#> # Source:   SQL [6 x 2]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>    SEQN Marital_Status
#>   <dbl> <chr>         
#> 1     1 <NA>          
#> 2     2 <NA>          
#> 3     3 <NA>          
#> 4     4 <NA>          
#> 5     5 Married       
#> 6     6 Never married

12.1.7 Pregnancy Status

PREG_STATUS_TBL <- DEMO %>%
  select(SEQN, RIDEXPRG) %>%
  mutate(Pregnant = 
           case_when(RIDEXPRG == 1 ~ "Pregnant",
                     RIDEXPRG == 2 ~ "Not Pregnant",
                     RIDEXPRG > 2 | is.na(RIDEXPRG) ~ NA )) %>%
  select(-RIDEXPRG)

PREG_STATUS_TBL %>% 
  head()
#> # Source:   SQL [6 x 2]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>    SEQN Pregnant    
#>   <dbl> <chr>       
#> 1     1 <NA>        
#> 2     2 <NA>        
#> 3     3 <NA>        
#> 4     4 <NA>        
#> 5     5 <NA>        
#> 6     6 Not Pregnant

12.1.8 Income

INCOME_TBL <- DEMO %>%
  select(SEQN, INDHHIN2, INDFMIN2, INDFMPIR) %>%
  mutate(Household_Icome = case_when(INDHHIN2 ==1~"$ 0 to $ 4,999",
INDHHIN2 ==2~"$ 5,000 to $ 9,999",
INDHHIN2 ==3~"$10,000 to $14,999",
INDHHIN2 ==4~"$15,000 to $19,999",
INDHHIN2 ==5~"$20,000 to $24,999",
INDHHIN2 ==6~"$25,000 to $34,999",
INDHHIN2 ==7~"$35,000 to $44,999",
INDHHIN2 ==8~"$45,000 to $54,999",
INDHHIN2 ==9~"$55,000 to $64,999",
INDHHIN2 ==10~"$65,000 to $74,999",
INDHHIN2 ==12~"$20,000 and Over",
INDHHIN2 ==13~"Under $20,000",
INDHHIN2 ==14~"$75,000 to $99,999",
INDHHIN2 ==15~"$100,000 and Over",
INDHHIN2 > 15 | is.na(INDHHIN2) ~ NA)) %>%
  select(-INDHHIN2) %>%
  mutate(Family_Income = case_when(INDFMIN2 ==1~"$ 0 to $ 4,999",
INDFMIN2 ==2~"$ 5,000 to $ 9,999",
INDFMIN2 ==3~"$10,000 to $14,999",
INDFMIN2 ==4~"$15,000 to $19,999",
INDFMIN2 ==5~"$20,000 to $24,999",
INDFMIN2 ==6~"$25,000 to $34,999",
INDFMIN2 ==7~"$35,000 to $44,999",
INDFMIN2 ==8~"$45,000 to $54,999",
INDFMIN2 ==9~"$55,000 to $64,999",
INDFMIN2 ==10~"$65,000 to $74,999",
INDFMIN2 ==12~"$20,000 and Over",
INDFMIN2 ==13~"Under $20,000",
INDFMIN2 ==14~"$75,000 to $99,999",
INDFMIN2 ==15~"$100,000 and Over",
INDFMIN2 > 15 | is.na(INDFMIN2) ~ NA )) %>%
  select(-INDFMIN2) %>%
  mutate(Poverty_Income_Ratio = 
           case_when(INDFMPIR <= 5 ~ INDFMPIR,
                     is.na(INDFMPIR) ~ NA)) %>%
  select(-INDFMPIR)


INCOME_TBL %>% 
  head()
#> # Source:   SQL [6 x 4]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>    SEQN Household_Icome Family_Income Poverty_Income_Ratio
#>   <dbl> <lgl>           <lgl>                        <dbl>
#> 1     1 NA              NA                            0.86
#> 2     2 NA              NA                            5   
#> 3     3 NA              NA                            1.47
#> 4     4 NA              NA                            0.57
#> 5     5 NA              NA                            5   
#> 6     6 NA              NA                            1.21

12.1.9 Demographic Features Final

DEMO_FEATURES <- DEMO %>%
  select(SEQN) %>%
  left_join(AGE_GENDER_TBL) %>%
  left_join(RACE_TBL) %>%
  left_join(USFA_TBL) %>%
  left_join(BIRTH_COUNTRY_TBL) %>%
  left_join(EDUCATION_TBL) %>%
  left_join(MARITAL_STATUS_TBL) %>%
  left_join(PREG_STATUS_TBL) %>%
  left_join(INCOME_TBL)
#> Joining, by = "SEQN"
#> Joining, by = "SEQN"
#> Joining, by = "SEQN"
#> Joining, by = "SEQN"
#> Joining, by = "SEQN"
#> Joining, by = "SEQN"
#> Joining, by = "SEQN"
#> Joining, by = "SEQN"
DEMO_FEATURES %>%
  glimpse()
#> Rows: ??
#> Columns: 13
#> Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#> $ SEQN                 <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15~
#> $ Age                  <dbl> 2, 77, 10, 1, 49, 19, 59, 13, 11, 43, 15, 37, 70,~
#> $ Gender               <chr> "Female", "Male", "Female", "Male", "Male", "Fema~
#> $ Race                 <chr> "Black", "White", "White", "Black", "White", "Oth~
#> $ USAF                 <chr> NA, "Yes", NA, NA, "Yes", "No", "No", NA, NA, "Ye~
#> $ Birth_Country        <chr> "USA", "USA", NA, "USA", "USA", "USA", "USA", "US~
#> $ Grade_Level          <chr> NA, NA, "3rd grade", NA, NA, "More than high scho~
#> $ Grade_Range          <chr> NA, "College graduate or above", NA, NA, "College~
#> $ Marital_Status       <chr> NA, NA, NA, NA, "Married", "Never married", "Marr~
#> $ Pregnant             <chr> NA, NA, NA, NA, NA, "Not Pregnant", "Not Pregnant~
#> $ Household_Icome      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ Family_Income        <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ Poverty_Income_Ratio <dbl> 0.86, 5.00, 1.47, 0.57, 5.00, 1.21, NA, 0.53, NA,~
DEMO_FEATURES %>%
  show_query()
#> <SQL>
#> SELECT
#>   `LHS`.`SEQN` AS `SEQN`,
#>   `Age`,
#>   `Gender`,
#>   `Race`,
#>   `USAF`,
#>   `Birth_Country`,
#>   `Grade_Level`,
#>   `Grade_Range`,
#>   `Marital_Status`,
#>   `Pregnant`,
#>   `Household_Icome`,
#>   `Family_Income`,
#>   `Poverty_Income_Ratio`
#> FROM (
#>   SELECT
#>     `LHS`.`SEQN` AS `SEQN`,
#>     `Age`,
#>     `Gender`,
#>     `Race`,
#>     `USAF`,
#>     `Birth_Country`,
#>     `Grade_Level`,
#>     `Grade_Range`,
#>     `Marital_Status`,
#>     `Pregnant`
#>   FROM (
#>     SELECT
#>       `LHS`.`SEQN` AS `SEQN`,
#>       `Age`,
#>       `Gender`,
#>       `Race`,
#>       `USAF`,
#>       `Birth_Country`,
#>       `Grade_Level`,
#>       `Grade_Range`,
#>       `Marital_Status`
#>     FROM (
#>       SELECT
#>         `LHS`.`SEQN` AS `SEQN`,
#>         `Age`,
#>         `Gender`,
#>         `Race`,
#>         `USAF`,
#>         `Birth_Country`,
#>         `Grade_Level`,
#>         `Grade_Range`
#>       FROM (
#>         SELECT
#>           `LHS`.`SEQN` AS `SEQN`,
#>           `Age`,
#>           `Gender`,
#>           `Race`,
#>           `USAF`,
#>           `Birth_Country`
#>         FROM (
#>           SELECT `LHS`.`SEQN` AS `SEQN`, `Age`, `Gender`, `Race`, `USAF`
#>           FROM (
#>             SELECT `LHS`.`SEQN` AS `SEQN`, `Age`, `Gender`, `Race`
#>             FROM (
#>               SELECT `LHS`.`SEQN` AS `SEQN`, `Age`, `Gender`
#>               FROM (
#>                 SELECT `SEQN`
#>                 FROM `DEMO`
#>               ) AS `LHS`
#>               LEFT JOIN (
#>                 SELECT
#>                   `SEQN`,
#>                   CASE WHEN ((`RIDAGEYR` IS NULL)) THEN (`RIDAGEMN` / 12.0) WHEN NOT ((`RIDAGEYR` IS NULL)) THEN `RIDAGEYR` END AS `Age`,
#>                   `Gender`
#>                 FROM (
#>                   SELECT
#>                     *,
#>                     CASE WHEN (`RIAGENDR` = 2.0) THEN 'Female' WHEN NOT (`RIAGENDR` = 2.0) THEN 'Male' END AS `Gender`
#>                   FROM `DEMO`
#>                 )
#>               ) AS `RHS`
#>                 ON (`LHS`.`SEQN` = `RHS`.`SEQN`)
#>             ) AS `LHS`
#>             LEFT JOIN (
#>               SELECT
#>                 `SEQN`,
#>                 CASE
#> WHEN (`RIDRETH1` = 1.0) THEN 'Mexican American'
#> WHEN (`RIDRETH1` = 2.0) THEN 'Other Hispanic'
#> WHEN (`RIDRETH1` = 3.0) THEN 'White'
#> WHEN (`RIDRETH1` = 4.0) THEN 'Black'
#> WHEN (`RIDRETH1` = 5.0) THEN 'Other'
#> WHEN (NOT((`RIDRETH1` IN (1.0, 2.0, 3.0, 4.0, 5.0)))) THEN NULL
#> END AS `Race`
#>               FROM `DEMO`
#>             ) AS `RHS`
#>               ON (`LHS`.`SEQN` = `RHS`.`SEQN`)
#>           ) AS `LHS`
#>           LEFT JOIN (
#>             SELECT
#>               `SEQN`,
#>               CASE
#> WHEN ((`DMQMILIZ` = 1.0 OR `DMQMILIT` = 1.0)) THEN 'Yes'
#> WHEN ((`DMQMILIZ` = 2.0 OR `DMQMILIT` = 2.0)) THEN 'No'
#> WHEN (NOT((`DMQMILIZ` IN (1.0, 2.0) OR `DMQMILIT` IN (1.0, 2.0))) OR ((`DMQMILIZ` IS NULL) AND (`DMQMILIT` IS NULL))) THEN NULL
#> END AS `USAF`
#>             FROM `DEMO`
#>           ) AS `RHS`
#>             ON (`LHS`.`SEQN` = `RHS`.`SEQN`)
#>         ) AS `LHS`
#>         LEFT JOIN (
#>           SELECT
#>             `SEQN`,
#>             CASE
#> WHEN (`DMDBORN` = 1.0 OR `DMDBORN2` = 1.0 OR `DMDBORN4` = 1.0) THEN 'USA'
#> WHEN (`DMDBORN` = 2.0 OR `DMDBORN2` = 2.0 OR `DMDBORN4` = 2.0) THEN 'Other'
#> WHEN ((`DMDBORN` > 2.0 OR `DMDBORN2` > 2.0 OR `DMDBORN4` > 2.0) OR ((`DMDBORN` IS NULL) AND (`DMDBORN2` IS NULL) AND (`DMDBORN4` IS NULL))) THEN NULL
#> END AS `Birth_Country`
#>           FROM `DEMO`
#>         ) AS `RHS`
#>           ON (`LHS`.`SEQN` = `RHS`.`SEQN`)
#>       ) AS `LHS`
#>       LEFT JOIN (
#>         SELECT
#>           `SEQN`,
#>           `Grade_Level`,
#>           CASE
#> WHEN (`DMDEDUC2` = 1.0) THEN 'Less than 9th grade'
#> WHEN (`DMDEDUC2` = 2.0) THEN '9-11th grade (Includes 12th grade with no diploma)'
#> WHEN (`DMDEDUC2` = 3.0) THEN 'High school graduate/GED or equivalent'
#> WHEN (`DMDEDUC2` = 4.0) THEN 'Some college or AA degree'
#> WHEN (`DMDEDUC2` = 5.0) THEN 'College graduate or above'
#> WHEN (`DMDEDUC2` > 5.0 OR (`DMDEDUC2` IS NULL)) THEN NULL
#> END AS `Grade_Range`
#>         FROM (
#>           SELECT
#>             `SEQN`,
#>             `DMDEDUC2`,
#>             CASE
#> WHEN (`DMDEDUC3` = 0.0) THEN 'Never attended / kindergarten only'
#> WHEN (`DMDEDUC3` = 1.0) THEN '1st grade'
#> WHEN (`DMDEDUC3` = 2.0) THEN '2nd grade'
#> WHEN (`DMDEDUC3` = 3.0) THEN '3rd grade'
#> WHEN (`DMDEDUC3` = 4.0) THEN '4th grade'
#> WHEN (`DMDEDUC3` = 5.0) THEN '5th grade'
#> WHEN (`DMDEDUC3` = 6.0) THEN '6th grade'
#> WHEN (`DMDEDUC3` = 7.0) THEN '7th grade'
#> WHEN (`DMDEDUC3` = 8.0) THEN '8th grade'
#> WHEN (`DMDEDUC3` = 9.0) THEN '9th grade'
#> WHEN (`DMDEDUC3` = 10.0) THEN '10th grade'
#> WHEN (`DMDEDUC3` = 11.0) THEN '11th grade'
#> WHEN (`DMDEDUC3` = 12.0) THEN '12th grade, no diploma'
#> WHEN (`DMDEDUC3` = 13.0) THEN 'High school graduate'
#> WHEN (`DMDEDUC3` = 14.0) THEN 'GED or equivalent'
#> WHEN (`DMDEDUC3` = 15.0) THEN 'More than high school'
#> WHEN (`DMDEDUC3` = 55.0) THEN 'Less than 5th grade'
#> WHEN (`DMDEDUC3` = 66.0) THEN 'Less than 9th grade'
#> WHEN (`DMDEDUC3` = 77.0) THEN 'Refused'
#> WHEN (`DMDEDUC3` = 99.0) THEN 'Don''t Know'
#> WHEN ((`DMDEDUC3` IS NULL) OR `DMDEDUC3` > 66.0) THEN NULL
#> END AS `Grade_Level`
#>           FROM `DEMO`
#>         )
#>       ) AS `RHS`
#>         ON (`LHS`.`SEQN` = `RHS`.`SEQN`)
#>     ) AS `LHS`
#>     LEFT JOIN (
#>       SELECT
#>         `SEQN`,
#>         CASE
#> WHEN (`DMDMARTL` = 1.0) THEN 'Married'
#> WHEN (`DMDMARTL` = 2.0) THEN 'Widowed'
#> WHEN (`DMDMARTL` = 3.0) THEN 'Divorced'
#> WHEN (`DMDMARTL` = 4.0) THEN 'Separated'
#> WHEN (`DMDMARTL` = 5.0) THEN 'Never married'
#> WHEN (`DMDMARTL` = 6.0) THEN 'Living with partner'
#> WHEN (`DMDMARTL` > 6.0 OR (`DMDMARTL` IS NULL)) THEN NULL
#> END AS `Marital_Status`
#>       FROM `DEMO`
#>     ) AS `RHS`
#>       ON (`LHS`.`SEQN` = `RHS`.`SEQN`)
#>   ) AS `LHS`
#>   LEFT JOIN (
#>     SELECT
#>       `SEQN`,
#>       CASE
#> WHEN (`RIDEXPRG` = 1.0) THEN 'Pregnant'
#> WHEN (`RIDEXPRG` = 2.0) THEN 'Not Pregnant'
#> WHEN (`RIDEXPRG` > 2.0 OR (`RIDEXPRG` IS NULL)) THEN NULL
#> END AS `Pregnant`
#>     FROM `DEMO`
#>   ) AS `RHS`
#>     ON (`LHS`.`SEQN` = `RHS`.`SEQN`)
#> ) AS `LHS`
#> LEFT JOIN (
#>   SELECT
#>     `SEQN`,
#>     `Household_Icome`,
#>     `Family_Income`,
#>     CASE
#> WHEN (`INDFMPIR` <= 5.0) THEN `INDFMPIR`
#> WHEN ((`INDFMPIR` IS NULL)) THEN NULL
#> END AS `Poverty_Income_Ratio`
#>   FROM (
#>     SELECT
#>       `SEQN`,
#>       `INDFMPIR`,
#>       `Household_Icome`,
#>       CASE
#> WHEN (`INDFMIN2` = 1.0) THEN '$ 0 to $ 4,999'
#> WHEN (`INDFMIN2` = 2.0) THEN '$ 5,000 to $ 9,999'
#> WHEN (`INDFMIN2` = 3.0) THEN '$10,000 to $14,999'
#> WHEN (`INDFMIN2` = 4.0) THEN '$15,000 to $19,999'
#> WHEN (`INDFMIN2` = 5.0) THEN '$20,000 to $24,999'
#> WHEN (`INDFMIN2` = 6.0) THEN '$25,000 to $34,999'
#> WHEN (`INDFMIN2` = 7.0) THEN '$35,000 to $44,999'
#> WHEN (`INDFMIN2` = 8.0) THEN '$45,000 to $54,999'
#> WHEN (`INDFMIN2` = 9.0) THEN '$55,000 to $64,999'
#> WHEN (`INDFMIN2` = 10.0) THEN '$65,000 to $74,999'
#> WHEN (`INDFMIN2` = 12.0) THEN '$20,000 and Over'
#> WHEN (`INDFMIN2` = 13.0) THEN 'Under $20,000'
#> WHEN (`INDFMIN2` = 14.0) THEN '$75,000 to $99,999'
#> WHEN (`INDFMIN2` = 15.0) THEN '$100,000 and Over'
#> WHEN (`INDFMIN2` > 15.0 OR (`INDFMIN2` IS NULL)) THEN NULL
#> END AS `Family_Income`
#>     FROM (
#>       SELECT
#>         `SEQN`,
#>         `INDFMIN2`,
#>         `INDFMPIR`,
#>         CASE
#> WHEN (`INDHHIN2` = 1.0) THEN '$ 0 to $ 4,999'
#> WHEN (`INDHHIN2` = 2.0) THEN '$ 5,000 to $ 9,999'
#> WHEN (`INDHHIN2` = 3.0) THEN '$10,000 to $14,999'
#> WHEN (`INDHHIN2` = 4.0) THEN '$15,000 to $19,999'
#> WHEN (`INDHHIN2` = 5.0) THEN '$20,000 to $24,999'
#> WHEN (`INDHHIN2` = 6.0) THEN '$25,000 to $34,999'
#> WHEN (`INDHHIN2` = 7.0) THEN '$35,000 to $44,999'
#> WHEN (`INDHHIN2` = 8.0) THEN '$45,000 to $54,999'
#> WHEN (`INDHHIN2` = 9.0) THEN '$55,000 to $64,999'
#> WHEN (`INDHHIN2` = 10.0) THEN '$65,000 to $74,999'
#> WHEN (`INDHHIN2` = 12.0) THEN '$20,000 and Over'
#> WHEN (`INDHHIN2` = 13.0) THEN 'Under $20,000'
#> WHEN (`INDHHIN2` = 14.0) THEN '$75,000 to $99,999'
#> WHEN (`INDHHIN2` = 15.0) THEN '$100,000 and Over'
#> WHEN (`INDHHIN2` > 15.0 OR (`INDHHIN2` IS NULL)) THEN NULL
#> END AS `Household_Icome`
#>       FROM `DEMO`
#>     )
#>   )
#> ) AS `RHS`
#>   ON (`LHS`.`SEQN` = `RHS`.`SEQN`)

\(~\)


\(~\)

12.2 Lab Features

# browseURL('https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Laboratory&CycleBeginYear=2017')
ALB_CR <- tbl(NHANES_DB, "ALB_CR")
HDL <- tbl(NHANES_DB, 'HDL')
TRIGLY <- tbl(NHANES_DB, 'TRIGLY')
TCHOL <- tbl(NHANES_DB, "TCHOL")
INS <- tbl(NHANES_DB,"INS") 
GLU <- tbl(NHANES_DB, 'GLU')
UCPREG <- tbl(NHANES_DB,"UCPREG")

12.2.1 Mapping Column Issues

prep_LABS_TBL <- DEMO %>%
  select(SEQN, yr_range) %>%
  left_join(ALB_CR) %>%
  left_join(HDL) %>%
  left_join(TRIGLY) %>%
  left_join(TCHOL) %>%
  left_join(INS, 
            by=c("SEQN", "yr_range"),
            suffix=c('.TCHOL',''))  %>%
  left_join(GLU, 
            by=c("SEQN", "yr_range"),
            suffix=c('.INS','.GLU')) %>%
  left_join(UCPREG)
#> Joining, by = c("SEQN", "yr_range")
#> Joining, by = c("SEQN", "yr_range")
#> Joining, by = c("SEQN", "yr_range")
#> Joining, by = c("SEQN", "yr_range")
#> Joining, by = c("SEQN", "yr_range")
prep_LABS_TBL %>%
  glimpse()
#> Rows: ??
#> Columns: 43
#> Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#> $ SEQN           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, ~
#> $ yr_range       <chr> "1999-2000", "1999-2000", "1999-2000", "1999-2000", "19~
#> $ URXUMA         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URXUMS         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URXUCR         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URXCRS         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URDACT         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URXUMA2        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URDUMA2S       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URXUCR2        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URDUCR2S       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URDACT2        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URDUMALC       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URDUCRLC       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDHDD         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDHDDSI       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ WTSAF2YR.TCHOL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBXTR          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDTRSI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDLDL         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDLDLSI       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBXAPB         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDAPBSI       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDLDLM        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDLDMSI       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDLDLN        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDLDNSI       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBXTC          <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDTCSI        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ WTSAF2YR.INS   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBXIN.INS      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDINSI.INS    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ PHAFSTHR.INS   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ PHAFSTMN.INS   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDINLC        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ WTSAF2YR.GLU   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBXGLU         <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDGLUSI       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBXIN.GLU      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDINSI.GLU    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ PHAFSTHR.GLU   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ PHAFSTMN.GLU   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ URXPREG        <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
lab_features <- colnames(prep_LABS_TBL)[!colnames(prep_LABS_TBL) %in% c('SEQN','yr_range')] 
lab_features
#>  [1] "URXUMA"         "URXUMS"         "URXUCR"         "URXCRS"        
#>  [5] "URDACT"         "URXUMA2"        "URDUMA2S"       "URXUCR2"       
#>  [9] "URDUCR2S"       "URDACT2"        "URDUMALC"       "URDUCRLC"      
#> [13] "LBDHDD"         "LBDHDDSI"       "WTSAF2YR.TCHOL" "LBXTR"         
#> [17] "LBDTRSI"        "LBDLDL"         "LBDLDLSI"       "LBXAPB"        
#> [21] "LBDAPBSI"       "LBDLDLM"        "LBDLDMSI"       "LBDLDLN"       
#> [25] "LBDLDNSI"       "LBXTC"          "LBDTCSI"        "WTSAF2YR.INS"  
#> [29] "LBXIN.INS"      "LBDINSI.INS"    "PHAFSTHR.INS"   "PHAFSTMN.INS"  
#> [33] "LBDINLC"        "WTSAF2YR.GLU"   "LBXGLU"         "LBDGLUSI"      
#> [37] "LBXIN.GLU"      "LBDINSI.GLU"    "PHAFSTHR.GLU"   "PHAFSTMN.GLU"  
#> [41] "URXPREG"
lab_features_tibble <- tibble(lab_features)

lab_features_tibble %>%
  head()
#> # A tibble: 6 x 1
#>   lab_features
#>   <chr>       
#> 1 URXUMA      
#> 2 URXUMS      
#> 3 URXUCR      
#> 4 URXCRS      
#> 5 URDACT      
#> 6 URXUMA2
lab_features_tibble <- lab_features_tibble %>%
  mutate(appears_in_other_tables = str_detect(lab_features , "[.]"))

lab_features_tibble  %>%
  head()
#> # A tibble: 6 x 2
#>   lab_features appears_in_other_tables
#>   <chr>        <lgl>                  
#> 1 URXUMA       FALSE                  
#> 2 URXUMS       FALSE                  
#> 3 URXUCR       FALSE                  
#> 4 URXCRS       FALSE                  
#> 5 URDACT       FALSE                  
#> 6 URXUMA2      FALSE
lab_features_tibble %>%
  filter(appears_in_other_tables == TRUE) %>%
  separate(lab_features, into=c('feature','SRC_TBL'), sep='[.]')
#> # A tibble: 11 x 3
#>   feature  SRC_TBL appears_in_other_tables
#>   <chr>    <chr>   <lgl>                  
#> 1 WTSAF2YR TCHOL   TRUE                   
#> 2 WTSAF2YR INS     TRUE                   
#> 3 LBXIN    INS     TRUE                   
#> 4 LBDINSI  INS     TRUE                   
#> 5 PHAFSTHR INS     TRUE                   
#> 6 PHAFSTMN INS     TRUE                   
#> # ... with 5 more rows
lab_features_tibble <- tibble(lab_features) %>%
  mutate(tmp = lab_features) %>%
  separate(tmp, 
           into=c('feature','SRC_TBL'), 
           sep='[.]')
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 30 rows [1, 2, 3,
#> 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, 20, 21, ...].

lab_features_tibble
#> # A tibble: 41 x 3
#>   lab_features feature SRC_TBL
#>   <chr>        <chr>   <chr>  
#> 1 URXUMA       URXUMA  <NA>   
#> 2 URXUMS       URXUMS  <NA>   
#> 3 URXUCR       URXUCR  <NA>   
#> 4 URXCRS       URXCRS  <NA>   
#> 5 URDACT       URDACT  <NA>   
#> 6 URXUMA2      URXUMA2 <NA>   
#> # ... with 35 more rows
lab_features_tibble %>%
  filter(feature == 'WTSAF2YR')
#> # A tibble: 3 x 3
#>   lab_features   feature  SRC_TBL
#>   <chr>          <chr>    <chr>  
#> 1 WTSAF2YR.TCHOL WTSAF2YR TCHOL  
#> 2 WTSAF2YR.INS   WTSAF2YR INS    
#> 3 WTSAF2YR.GLU   WTSAF2YR GLU
lab_features_to_map <- lab_features_tibble %>%
  filter(!is.na(SRC_TBL)) %>%
  arrange(feature)

lab_features_to_map
#> # A tibble: 11 x 3
#>   lab_features feature  SRC_TBL
#>   <chr>        <chr>    <chr>  
#> 1 LBDINSI.INS  LBDINSI  INS    
#> 2 LBDINSI.GLU  LBDINSI  GLU    
#> 3 LBXIN.INS    LBXIN    INS    
#> 4 LBXIN.GLU    LBXIN    GLU    
#> 5 PHAFSTHR.INS PHAFSTHR INS    
#> 6 PHAFSTHR.GLU PHAFSTHR GLU    
#> # ... with 5 more rows
lab_features_to_map %>%
  filter(feature == 'WTSAF2YR')
#> # A tibble: 3 x 3
#>   lab_features   feature  SRC_TBL
#>   <chr>          <chr>    <chr>  
#> 1 WTSAF2YR.TCHOL WTSAF2YR TCHOL  
#> 2 WTSAF2YR.INS   WTSAF2YR INS    
#> 3 WTSAF2YR.GLU   WTSAF2YR GLU
prep_LABS_TBL %>%
  filter(!is.na(WTSAF2YR.TCHOL) | !is.na(WTSAF2YR.INS) | !is.na(WTSAF2YR.GLU)) %>%
  select(contains('WTSAF2YR'))
#> # Source:   SQL [?? x 3]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>   WTSAF2YR.TCHOL WTSAF2YR.INS WTSAF2YR.GLU
#>            <dbl>        <dbl>        <dbl>
#> 1             0            NA           0 
#> 2         67557.           NA       67557.
#> 3         80194.           NA       80194.
#> 4         15668.           NA       15668.
#> 5         93400.           NA       93400.
#> 6             0            NA           0 
#> # ... with more rows
LABS_TBL <- prep_LABS_TBL %>%
  mutate(WTSAF2YR = coalesce(WTSAF2YR.INS, WTSAF2YR.TCHOL , WTSAF2YR.GLU))

LABS_TBL %>%
  select(yr_range, WTSAF2YR, WTSAF2YR.TCHOL, WTSAF2YR.INS, WTSAF2YR.GLU) %>%
  filter(!is.na(WTSAF2YR))
#> # Source:   SQL [?? x 5]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>   yr_range  WTSAF2YR WTSAF2YR.TCHOL WTSAF2YR.INS WTSAF2YR.GLU
#>   <chr>        <dbl>          <dbl>        <dbl>        <dbl>
#> 1 2005-2006       0              0            NA           0 
#> 2 2005-2006   67557.         67557.           NA       67557.
#> 3 2005-2006   80194.         80194.           NA       80194.
#> 4 2005-2006   15668.         15668.           NA       15668.
#> 5 2005-2006   93400.         93400.           NA       93400.
#> 6 2005-2006       0              0            NA           0 
#> # ... with more rows
lab_features_to_map %>%
  filter(feature == 'LBDINSI')
#> # A tibble: 2 x 3
#>   lab_features feature SRC_TBL
#>   <chr>        <chr>   <chr>  
#> 1 LBDINSI.INS  LBDINSI INS    
#> 2 LBDINSI.GLU  LBDINSI GLU
prep_LABS_TBL %>%
  mutate(LBDINSI = coalesce(LBDINSI.INS, LBDINSI.GLU)) %>%
  filter(!is.na(LBDINSI.INS) | !is.na(LBDINSI.GLU) ) %>%
  select(contains('LBDINSI'))
#> # Source:   SQL [?? x 3]
#> # Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#>   LBDINSI.INS LBDINSI.GLU LBDINSI
#>         <dbl>       <dbl>   <dbl>
#> 1          NA        60.2    60.2
#> 2          NA        53.9    53.9
#> 3          NA        67.6    67.6
#> 4          NA        87.1    87.1
#> 5          NA        62.8    62.8
#> 6          NA        23.5    23.5
#> # ... with more rows
check_LABS_TBL <- prep_LABS_TBL %>%
  mutate(WTSAF2YR = coalesce(WTSAF2YR.TCHOL, WTSAF2YR.INS, WTSAF2YR.GLU)) %>%
  mutate(PHAFSTMN = coalesce(PHAFSTMN.INS, PHAFSTMN.GLU)) %>%
  mutate(PHAFSTHR = coalesce(PHAFSTHR.INS, PHAFSTHR.GLU)) %>%
  mutate(LBXIN = coalesce(LBXIN.INS,LBXIN.GLU)) %>%
  mutate(LBDINSI = coalesce(LBDINSI.INS,LBDINSI.GLU))

check_maped_cols <- check_LABS_TBL %>%
  select(contains(c('WTSAF2YR','PHAFSTMN','PHAFSTHR','LBXIN','LBDINSI'))) %>%
  colnames()

check_maped_cols
#>  [1] "WTSAF2YR.TCHOL" "WTSAF2YR.INS"   "WTSAF2YR.GLU"   "WTSAF2YR"      
#>  [5] "PHAFSTMN.INS"   "PHAFSTMN.GLU"   "PHAFSTMN"       "PHAFSTHR.INS"  
#>  [9] "PHAFSTHR.GLU"   "PHAFSTHR"       "LBXIN.INS"      "LBXIN.GLU"     
#> [13] "LBXIN"          "LBDINSI.INS"    "LBDINSI.GLU"    "LBDINSI"
check_LABS_TBL %>%
  select(all_of(check_maped_cols)) %>%
  filter_at(.vars=check_maped_cols, any_vars(!is.na(.))) %>%
  glimpse()
#> Rows: ??
#> Columns: 16
#> Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#> $ WTSAF2YR.TCHOL <dbl> 0.000, 67556.810, 80193.962, 15668.017, 93399.539, 0.00~
#> $ WTSAF2YR.INS   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ WTSAF2YR.GLU   <dbl> 0.000, 67556.810, 80193.962, 15668.017, 93399.539, 0.00~
#> $ WTSAF2YR       <dbl> 0.000, 67556.810, 80193.962, 15668.017, 93399.539, 0.00~
#> $ PHAFSTMN.INS   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ PHAFSTMN.GLU   <dbl> 3, 9, 29, 32, 35, NA, NA, 54, 17, 54, 33, 0, 27, 35, 22~
#> $ PHAFSTMN       <dbl> 3, 9, 29, 32, 35, NA, NA, 54, 17, 54, 33, 0, 27, 35, 22~
#> $ PHAFSTHR.INS   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ PHAFSTHR.GLU   <dbl> 10, 14, 11, 12, 14, NA, NA, 9, 3, 10, 11, 14, 13, 9, 14~
#> $ PHAFSTHR       <dbl> 10, 14, 11, 12, 14, NA, NA, 9, 3, 10, 11, 14, 13, 9, 14~
#> $ LBXIN.INS      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBXIN.GLU      <dbl> NA, 10.03, 8.99, 11.27, 14.51, NA, NA, 10.46, 3.91, 6.0~
#> $ LBXIN          <dbl> NA, 10.03, 8.99, 11.27, 14.51, NA, NA, 10.46, 3.91, 6.0~
#> $ LBDINSI.INS    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDINSI.GLU    <dbl> NA, 60.18, 53.94, 67.62, 87.06, NA, NA, 62.76, 23.46, 3~
#> $ LBDINSI        <dbl> NA, 60.18, 53.94, 67.62, 87.06, NA, NA, 62.76, 23.46, 3~

12.2.2 Lab Features Final

LABS_TBL <- check_LABS_TBL %>% 
  select(-WTSAF2YR.TCHOL, -WTSAF2YR.INS, -WTSAF2YR.GLU) %>%
  select(-PHAFSTMN.INS, -PHAFSTMN.GLU) %>%
  select(-PHAFSTHR.INS, -PHAFSTHR.GLU) %>%
  select(-LBXIN.INS , -LBXIN.GLU) %>%
  select(-LBDINSI.INS , -LBDINSI.GLU)


LABS_TBL  %>%
  filter_at(vars(lab_features_tibble$feature), any_vars(!is.na(.))) %>%
  glimpse()
#> Rows: ??
#> Columns: 37
#> Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#> $ SEQN     <dbl> 31128, 31129, 31130, 31131, 31132, 31133, 31134, 31137, 31139~
#> $ yr_range <chr> "2005-2006", "2005-2006", "2005-2006", "2005-2006", "2005-200~
#> $ URXUMA   <dbl> 82.1, 17.8, NA, 18.0, 6.5, 25.2, 10.1, 287.5, 5.9, 17.5, 5.1,~
#> $ URXUMS   <dbl> 82.1, 17.8, NA, 18.0, 6.5, 25.2, 10.1, 287.5, 5.9, 17.5, 5.1,~
#> $ URXUCR   <dbl> 291, 288, NA, 202, 162, 275, 140, 329, 125, 176, 91, 74, 143,~
#> $ URXCRS   <dbl> 25724, 25459, NA, 17857, 14321, 24310, 12376, 29084, 11050, 1~
#> $ URDACT   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ URXUMA2  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ URDUMA2S <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ URXUCR2  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ URDUCR2S <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ URDACT2  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ URDUMALC <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ URDUCRLC <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBDHDD   <dbl> 55, 46, NA, 39, 59, 54, 49, 45, NA, 57, NA, NA, 44, 45, 48, 4~
#> $ LBDHDDSI <dbl> 1.42, 1.19, NA, 1.01, 1.53, 1.40, 1.27, 1.16, NA, 1.47, NA, N~
#> $ LBXTR    <dbl> NA, NA, NA, 86, 65, 61, 195, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ LBDTRSI  <dbl> NA, NA, NA, 0.971, 0.734, 0.689, 2.202, NA, NA, NA, NA, NA, N~
#> $ LBDLDL   <dbl> NA, NA, NA, 49, 75, 81, 98, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBDLDLSI <dbl> NA, NA, NA, 1.267, 1.940, 2.095, 2.534, NA, NA, NA, NA, NA, N~
#> $ LBXAPB   <dbl> NA, NA, NA, 50, 75, 75, 111, NA, NA, NA, NA, NA, NA, NA, NA, ~
#> $ LBDAPBSI <dbl> NA, NA, NA, 0.50, 0.75, 0.75, 1.11, NA, NA, NA, NA, NA, NA, N~
#> $ LBDLDLM  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBDLDMSI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBDLDLN  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBDLDNSI <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBXTC    <dbl> 129, 170, NA, 105, 147, 147, 186, 129, NA, 141, NA, NA, 108, ~
#> $ LBDTCSI  <dbl> 3.34, 4.40, NA, 2.72, 3.80, 3.80, 4.81, 3.34, NA, 3.65, NA, N~
#> $ LBDINLC  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBXGLU   <dbl> NA, NA, NA, 90, 157, 84, 100, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ LBDGLUSI <dbl> NA, NA, NA, 4.996, 8.715, 4.663, 5.551, NA, NA, NA, NA, NA, N~
#> $ URXPREG  <dbl> NA, NA, NA, 2, NA, 2, NA, 2, 2, NA, NA, NA, NA, NA, NA, NA, N~
#> $ WTSAF2YR <dbl> NA, NA, 0.00, 67556.81, 80193.96, 15668.02, 93399.54, NA, 0.0~
#> $ PHAFSTMN <dbl> NA, NA, 3, 9, 29, 32, 35, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ PHAFSTHR <dbl> NA, NA, 10, 14, 11, 12, 14, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ LBXIN    <dbl> NA, NA, NA, 10.03, 8.99, 11.27, 14.51, NA, NA, NA, NA, NA, NA~
#> $ LBDINSI  <dbl> NA, NA, NA, 60.18, 53.94, 67.62, 87.06, NA, NA, NA, NA, NA, N~

\(~\)


\(~\)

12.3 Examination Features

# browseURL('https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Examination&CycleBeginYear=2017')
BPX <- tbl(NHANES_DB, 'BPX')
BMX <- tbl(NHANES_DB,'BMX')

12.3.1 Examination Features

MEASURES_TBL <- DEMO %>%
  select(SEQN, yr_range) %>%
  left_join(BPX) %>%
  left_join(BMX)
#> Joining, by = c("SEQN", "yr_range")
#> Joining, by = c("SEQN", "yr_range")

MEASURES_TBL %>%
  glimpse()
#> Rows: ??
#> Columns: 84
#> Database: sqlite 3.38.5 [C:\Users\jkyle\Documents\GitHub\Jeff_Data_Wrangling\DATA\sql_db\NHANES_DB.db]
#> $ SEQN     <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18~
#> $ yr_range <chr> "1999-2000", "1999-2000", "1999-2000", "1999-2000", "1999-200~
#> $ PEASCST1 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
#> $ PEASCTM1 <dbl> 151, 764, 571, 47, 694, 581, 852, 807, 579, 584, 751, 1169, 7~
#> $ PEASCCT1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
#> $ BPXCHR   <dbl> 110, NA, NA, 108, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,~
#> $ BPQ150A  <dbl> NA, 2, 2, NA, 2, 2, 2, 1, 2, 2, 1, 2, 1, 2, 1, 1, NA, NA, NA,~
#> $ BPQ150B  <dbl> NA, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, NA, NA,~
#> $ BPQ150C  <dbl> NA, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, NA, NA,~
#> $ BPQ150D  <dbl> NA, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, NA, NA,~
#> $ BPAARM   <dbl> NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA,~
#> $ BPACSZ   <dbl> NA, 3, 2, NA, 4, 2, 4, 2, 3, 4, 3, 4, 2, 3, 3, 3, NA, NA, NA,~
#> $ BPXPLS   <dbl> NA, 68, 104, NA, 66, 70, 58, 96, 84, 58, 62, 64, 102, 72, 68,~
#> $ BPXDB    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BPXPULS  <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
#> $ BPXPTY   <dbl> NA, 1, 1, NA, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, NA, NA, NA,~
#> $ BPXML1   <dbl> NA, 130, 140, NA, 140, 140, 160, 130, 140, 190, 130, 200, 170~
#> $ BPXSY1   <dbl> NA, 106, 110, NA, 122, 116, 130, NA, 104, 152, 110, 182, 140,~
#> $ BPXDI1   <dbl> NA, 58, 60, NA, 82, 64, 78, NA, 60, 98, 52, 108, 78, 56, 68, ~
#> $ BPAEN1   <dbl> NA, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, NA, NA,~
#> $ BPXSY2   <dbl> NA, 98, 104, NA, 122, 116, 122, 106, 114, 142, 110, 172, 130,~
#> $ BPXDI2   <dbl> NA, 56, 64, NA, 84, 60, 80, 44, 52, 94, 52, 98, 62, NA, 68, 5~
#> $ BPAEN2   <dbl> NA, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, NA, NA,~
#> $ BPXSY3   <dbl> NA, 98, 112, NA, 122, 112, 124, 100, 110, 142, 104, 176, 130,~
#> $ BPXDI3   <dbl> NA, 56, 62, NA, 82, 80, 82, 48, 48, 96, 50, 100, 70, 58, 70, ~
#> $ BPAEN3   <dbl> NA, 2, 2, NA, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, NA, NA, NA,~
#> $ BPXSY4   <dbl> NA, NA, NA, NA, NA, NA, NA, 96, NA, NA, NA, NA, NA, 138, NA, ~
#> $ BPXDI4   <dbl> NA, NA, NA, NA, NA, NA, NA, 56, NA, NA, NA, NA, NA, 64, NA, N~
#> $ BPAEN4   <dbl> NA, NA, NA, NA, NA, NA, NA, 2, NA, NA, NA, NA, NA, 2, NA, NA,~
#> $ BPXSAR   <dbl> NA, 98, 108, NA, 122, 114, 123, 98, 112, 142, 107, 174, 130, ~
#> $ BPXDAR   <dbl> NA, 56, 63, NA, 83, 70, 81, 52, 50, 95, 51, 99, 66, 61, 69, 6~
#> $ BMAEXLEN <dbl> 289, 376, 199, 170, 277, 252, 303, 257, 178, 218, 311, 274, 2~
#> $ BMAEXSTS <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
#> $ BMAEXCMT <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXWT    <dbl> 12.5, 75.4, 32.9, 13.3, 92.5, 59.2, 78.0, 40.7, 45.5, 111.8, ~
#> $ BMIWT    <dbl> 3, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 3,~
#> $ BMXRECUM <dbl> 93.2, NA, NA, 87.1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMIRECUM <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXHEAD  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMIHEAD  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXHT    <dbl> 91.6, 174.0, 136.6, NA, 178.3, 162.0, 162.9, 162.0, 156.9, 19~
#> $ BMIHT    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXBMI   <dbl> 14.90, 24.90, 17.63, NA, 29.10, 22.56, 29.39, 15.51, 18.48, 3~
#> $ BMXLEG   <dbl> NA, NA, 34.1, NA, 45.2, 39.7, 43.0, 41.6, 42.2, 46.6, 40.5, 4~
#> $ BMILEG   <dbl> NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA~
#> $ BMXCALF  <dbl> NA, 37.5, 29.0, NA, 42.6, 34.0, 37.2, 30.0, 32.7, 43.7, 37.8,~
#> $ BMICALF  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXARML  <dbl> 18.6, 38.2, 25.5, 20.4, 39.7, 34.5, 38.1, 36.7, 32.0, 43.0, 3~
#> $ BMIARML  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXARMC  <dbl> 15.2, 29.8, 19.7, 16.4, 35.8, 26.0, 31.7, 20.1, 22.1, 37.6, 2~
#> $ BMIARMC  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXWAIST <dbl> 45.7, 98.0, 64.7, NA, 99.9, 81.6, 90.7, 64.1, 64.6, 108.0, 76~
#> $ BMIWAIST <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA~
#> $ BMXTHICR <dbl> NA, NA, 38.2, NA, 56.2, 47.0, 55.7, 39.1, 45.3, 64.0, 39.9, 5~
#> $ BMITHICR <dbl> NA, 1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA,~
#> $ BMXTRI   <dbl> 8.3, 12.8, 8.0, 10.8, 17.4, 20.3, 26.4, 6.2, 11.1, 15.5, 8.0,~
#> $ BMITRI   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXSUB   <dbl> 5.0, 20.4, 6.5, 10.2, 38.6, 16.8, 34.2, 5.2, 7.6, 26.6, 7.8, ~
#> $ BMISUB   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, 1, 1, NA, ~
#> $ BMAAMP   <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2~
#> $ BMAUREXT <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMAUPREL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMAULEXT <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMAUPLEL <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMALOREX <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMALORKN <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMALLEXT <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMALLKNE <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDSTATS <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDRECUF <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDSUBF  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDTHICF <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDLEGF  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDARMLF <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDCALFF <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDBMIC  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXSAD1  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXSAD2  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXSAD3  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXSAD4  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDAVSAD <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMDSADCM <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMXHIP   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~
#> $ BMIHIP   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N~

This concludes the section on engineering features.