3.5 Summary

library(tidyverse)

data("mtcars")
df = mtcars %>%
    dplyr::select(cyl, vs, carb)


df_factor = df %>%
    dplyr::mutate(
        cyl = factor(cyl),
        vs = factor(vs),
        carb = factor(carb)
    )
# summary(df)
str(df)
#> 'data.frame':    32 obs. of  3 variables:
#>  $ cyl : num  6 6 4 6 8 6 8 4 4 6 ...
#>  $ vs  : num  0 0 1 1 0 1 0 1 1 1 ...
#>  $ carb: num  4 4 1 1 2 1 4 2 2 4 ...
str(df_factor)
#> 'data.frame':    32 obs. of  3 variables:
#>  $ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
#>  $ vs  : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
#>  $ carb: Factor w/ 6 levels "1","2","3","4",..: 4 4 1 1 2 1 4 2 2 4 ...

Get the correlation table for continuous variables only

cor(df)
#>             cyl         vs       carb
#> cyl   1.0000000 -0.8108118  0.5269883
#> vs   -0.8108118  1.0000000 -0.5696071
#> carb  0.5269883 -0.5696071  1.0000000

# only complete obs
# cor(df, use = "complete.obs")

Alternatively, you can also have the

Hmisc::rcorr(as.matrix(df), type = "pearson")
#>        cyl    vs  carb
#> cyl   1.00 -0.81  0.53
#> vs   -0.81  1.00 -0.57
#> carb  0.53 -0.57  1.00
#> 
#> n= 32 
#> 
#> 
#> P
#>      cyl    vs     carb  
#> cyl         0.0000 0.0019
#> vs   0.0000        0.0007
#> carb 0.0019 0.0007
modelsummary::datasummary_correlation(df)
cyl vs carb
cyl 1 . .
vs −.81 1 .
carb .53 −.57 1
ggcorrplot::ggcorrplot(cor(df))

Different comparison between different correlation between different types of variables (i.e., continuous vs. categorical) can be problematic. Moreover, the problem of detecting non-linear vs. linear relationship/correlation is another one. Hence, a solution is that using mutual information from information theory (i.e., knowing one variable can reduce uncertainty about the other).

To implement mutual information, we have the following approximations

\[ \downarrow \text{prediction error} \approx \downarrow \text{uncertainty} \approx \downarrow \text{association strength} \]

More specifically, following the X2Y metric, we have the following steps:

  1. Predict \(y\) without \(x\) (i.e., baseline model)

    1. Average of \(y\) when \(y\) is continuous

    2. Most frequent value when \(y\) is categorical

  2. Predict \(y\) with \(x\) (e.g., linear, random forest, etc.)

  3. Calculate the prediction error difference between 1 and 2

To have a comprehensive table that could handle

  • continuous vs. continuous

  • categorical vs. continuous

  • continuous vs. categorical

  • categorical vs. categorical

the suggested model would be Classification and Regression Trees (CART). But we can certainly use other models as well.

The downfall of this method is that you might suffer

  1. Symmetry: \((x,y) \neq (y,x)\)
  2. Comparability : Different pair of comparison might use different metrics (e.g., misclassification error vs. MAE)
library(ppsr)

iris <- iris %>% 
  select(1:3)

# ppsr::score_df(iris) # if you want a dataframe
ppsr::score_matrix(iris,
                   do_parallel = TRUE,
                   n_cores = parallel::detectCores() / 2)
#>              Sepal.Length Sepal.Width Petal.Length
#> Sepal.Length   1.00000000  0.04632352    0.5491398
#> Sepal.Width    0.06790301  1.00000000    0.2376991
#> Petal.Length   0.61608360  0.24263851    1.0000000

# if you want a similar correlation matrix
ppsr::score_matrix(df,
                   do_parallel = TRUE,
                   n_cores = parallel::detectCores() / 2)
#>             cyl        vs      carb
#> cyl  1.00000000 0.3982789 0.2092533
#> vs   0.02514286 1.0000000 0.2000000
#> carb 0.30798148 0.2537309 1.0000000

3.5.1 Visualization

corrplot::corrplot(cor(df))

Alternatively,

PerformanceAnalytics::chart.Correlation(df, histogram = T, pch = 19)

heatmap(as.matrix(df))

More general form,

ppsr::visualize_pps(
    df = iris,
    do_parallel = TRUE,
    n_cores = parallel::detectCores() / 2
)

ppsr::visualize_correlations(
    df = iris
)

Both heat map and correlation at the same time

ppsr::visualize_both(
    df = iris,
    do_parallel = TRUE,
    n_cores = parallel::detectCores() / 2
)

More elaboration with ggplot2

ppsr::visualize_pps(
    df = iris,
    color_value_high = 'red',
    color_value_low = 'yellow',
    color_text = 'black'
) +
    ggplot2::theme_classic() +
    ggplot2::theme(plot.background = 
                       ggplot2::element_rect(fill = "lightgrey")) +
    ggplot2::theme(title = ggplot2::element_text(size = 15)) +
    ggplot2::labs(
        title = 'Correlation aand Heatmap',
        subtitle = 'Subtitle',
        caption = 'Caption',
        x = 'More info'
    )