Simulations for Jakob

DAG

We’re interested in how varying the correlation of PRS with the confounder C affects the distribution of the confounder among cases (Y) and controls (non-Y).

To simulate this in R, you can follow these steps:

Define Your Variables and Model: Based on the DAG, it appears PRS affects C, which in turn affects both exposure (X) and the outcome (Y).
Set Up Simulation Parameters: You’ll need to decide the base rates for each variable, the strength of the causal effects (expressed as probabilities or regression coefficients), and the range of correlations between PRS and C you want to explore.
Simulate Data: Use the rnorm() function to generate normally distributed data for PRS and C, adjusting the means and standard deviations to reflect the varying correlation. Use a statistical model (e.g., logistic regression) to simulate the effects of C on X and Y.
Calculate Enrichment: For each simulation, you’ll need to calculate the proportion of cases and controls that have the comorbidity (C).
Repeat and Analyze: Repeat the simulation multiple times for each level of correlation to get a distribution of outcomes. Analyze how the enrichment of the confounder among cases and controls changes with different correlations.

Recall that

The formula for creating C from PRS with correlation $\r^2$ would be: \[C=r×PRS+\sqrt{(1−r2)} \cdot ϵ\]

$C=r×PRS \cdot \sqrt{(1−r2)}\cdot \epsilon$

where $\epsilon$ is a standard normal error term, independent of PRS.

The term is there to ensure that the variance of C is 1, which is the variance of PRS. The variance of C can be calculated as follows:

$Var(C)=\mathcal(Var) (r\cdot PRS)+Var(1−r2\cdot \epsilon)$

$$Var(C)=r2\cdot Var(PRS)+(1−r2)×Var(ϵ)$

$$Var(C)=r^2 \cdot 1+(1−r^2) \cdot 1$

Since both PRS and $\epsilon$ have a variance of 1: $Var(C)=1$ This ensures that C will have a standard deviation of 1, just like PRS, and the specified correlation rr with PRS.

# Set the number of simulations
n_simulations <- 100
n_individuals <- 10000
correlations <- seq(0, 1, by = 0.1) # Varying correlations from -1 to 1

# Empty dataframe to store results
simulation_results <- data.frame(correlation = numeric(),
                                 enrichment_cases = numeric(),
                                 enrichment_controls = numeric())
simulation_results <- expand.grid(correlation = correlations, sim_id = 1:n_simulations)
simulation_results$enrichment_cases <- NA
simulation_results$enrichment_controls <- NA

# Simulation index
sim_index <- 1
beta_X=10
beta_Y=10
beta_YX=10

cor_sim=array(data=NA,dim = c(length(correlations),n_simulations,n_individuals,4))
dimnames(cor_sim)=list(correlations,1:n_simulations,1:n_individuals,c("PRS","C","X","Y"))

for (i in 1:length(correlations)) {
  r=correlations[i]
    for (j in 1:n_simulations) {
        # Simulate PRS scores
        PRS <- rnorm(n_individuals, mean = 0, sd = 1)
        cor_sim[i,j,,1]=PRS
        # Generate an independent standard normal error term
        epsilon <- rnorm(n_individuals, mean = 0, sd = 1)
        
        # Simulate confounder C with varying correlation with PRS
        C <- r * PRS + sqrt(1 - r^2) * epsilon
        cor_sim[i,j,,2]=C
        # Model the effect of C on X and Y using logistic regression
        # Ensure that the probabilities are reasonable
        prob_X <- plogis(beta_X * C)  # You need to define beta_X
        X <- rbinom(n_individuals, 1, prob = prob_X)
        cor_sim[i,j,,3]=X
        
        prob_Y <- plogis(beta_Y * C + beta_YX * X)  # You need to define beta_Y and beta_YX
        
        
        
        Y <- rbinom(n_individuals, 1, prob = prob_Y)
        cor_sim[i,j,,4]=Y
      
    }
}


plot(cor_sim["0.1",1,,"PRS"],cor_sim["0.1","1",,"C"])

plot(cor_sim["0.9",1,,"PRS"],cor_sim["0.9","1",,"C"])

# Assuming beta_X, beta_Y, and beta_YX are defined

# Initialize a list to hold results
enrichment_results <- data.frame()

# Loop over correlation levels
for (i in 1:length(correlations)) {
  # Loop over simulations
  for (j in 1:n_simulations) {
    # Extract the data for the current correlation and simulation
    sim_data <- cor_sim[i, j, , ]
    
    # Calculate PRS deciles
    PRS_deciles <- as.integer(cut(sim_data[, "PRS"], breaks = quantile(sim_data[, "PRS"], probs = 0:10/10), include.lowest = TRUE))
    
    # Initialize vectors to store enrichment for cases and controls for each decile
    enrichment_cases <- numeric(10)
    enrichment_controls <- numeric(10)
    
    # Loop over deciles
    for (decile in 1:10) {
      # Get the indices for the current decile
      decile_indices <- which(PRS_deciles == decile)
      
      # Calculate enrichment for cases and controls within the decile
      enrichment_cases[decile] <- mean(sim_data[decile_indices, "C"][sim_data[decile_indices, "X"] == 1])
      enrichment_controls[decile] <- mean(sim_data[decile_indices, "C"][sim_data[decile_indices, "X"] == 0])
    }
    
    # Combine enrichment for cases and controls and add to the results data frame
    enrichment_results <- rbind(enrichment_results, data.frame(
      correlation = correlations[i],
      simulation = j,
      decile = 1:10,
      enrichment_cases = enrichment_cases,
      enrichment_controls = enrichment_controls
    ))
  }
}
# Now you can plot using ggplot2 or base R plotting functions

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

# Summarize enrichment across simulations for each decile and correlation level
summary_results <- enrichment_results %>%
  group_by(correlation, decile) %>%
  summarize(
    mean_enrichment_cases = mean(enrichment_cases),
    sd_enrichment_cases = sd(enrichment_cases),
    mean_enrichment_controls = mean(enrichment_controls),
    sd_enrichment_controls = sd(enrichment_controls)
  )

`summarise()` has grouped output by 'correlation'. You can override using the
`.groups` argument.

# Assuming simulation_results is already summarized with mean values
library(ggplot2)

# Assuming your summary dataframe is set up correctly

  
ggplot(summary_results, aes(x = as.factor(decile), y = mean_enrichment_cases, group = as.factor(decile), color = as.factor(decile))) +
geom_point() +
geom_errorbar(aes(ymin = mean_enrichment_cases - sd_enrichment_cases, ymax = mean_enrichment_cases + sd_enrichment_cases), width = 0.1) +
facet_wrap(~ correlation, scales = "free_y") +
labs(x = "PRS Decile", y = "Mean Confounding Score of Cases (SD)", title = " Confounder Enrichment by PRS Decile and Correlation",color="PRS Decile") +
theme_minimal() +
theme(legend.position = "bottom")

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).

We can simulate a few scenarios:

# Disease Scenarios - Define your betas based on literature 
scenarios <- list(
  "CAD" = list(beta_X = 0.5, beta_Y = 0.2, beta_YX = 0.4),
  "ALZ" = list(beta_X = 0.2, beta_Y = 0.3, beta_YX = 0.2),
  "BC"  = list(beta_X = ..., beta_Y = ..., beta_YX = ...) # Fill in for Breast Cancer
) 

for(s in 1:length(scenarios)){
  

for (i in 1:length(correlations)) {
  r <- correlations[i] 

  for (j in 1:n_simulations) {

   PRS <- rnorm(n_individuals, mean = 0, sd = 1)
  
    # Confounder Simulation (Example with CAD - Blood Pressure)
    confounder  <- r * PRS + sqrt(1 - r^2) * epsilon
    cor_sim[i, j, , 2] <- confounder

    # Select disease scenario (Example with CAD) 
    scenario <- scenarios[[x]] 

    # Model the effect of C on X and Y using logistic regression
    prob_X <- plogis(scenario$beta_X * confounder) 
    X <- rbinom(n_individuals, 1, prob = prob_X)
    cor_sim[i, j, , 3] <- X 

    prob_Y <- plogis(scenario$beta_Y * confounder + scenario$beta_YX * X) 
    Y <- rbinom(n_individuals, 1, prob = prob_Y)
    cor_sim[i, j, , 4] <- Y 
  }
}

library(dplyr)
library(ggplot2)
library(dplyr)

summary_results <- enrichment_results %>%
  group_by(correlation, decile) %>%
  summarize(
    mean_enrichment_cases = mean(enrichment_cases),
    se_enrichment_cases = sd(enrichment_cases) / sqrt(n()),
    mean_enrichment_controls = mean(enrichment_controls),
    se_enrichment_controls = sd(enrichment_controls) / sqrt(n()),
    .groups = 'drop'
  )

library(ggplot2)

ggplot(summary_results, aes(x = as.factor(decile))) +
  geom_line(aes(y = mean_enrichment_cases, group = correlation, color = "Cases"), size = 1) +
  geom_errorbar(aes(ymin = mean_enrichment_cases - se_enrichment_cases, ymax = mean_enrichment_cases + se_enrichment_cases, group = correlation, color = "Cases"), width = 0.2) +
  geom_line(aes(y = mean_enrichment_controls, group = correlation, color = "Controls"), size = 1, linetype = "dashed") +
  geom_errorbar(aes(ymin = mean_enrichment_controls - se_enrichment_controls, ymax = mean_enrichment_controls + se_enrichment_controls, group = correlation, color = "Controls"), width = 0.2, linetype = "dashed") +
  facet_wrap(~ correlation, scales = "free_y") +
  labs(x = "PRS Decile", y = "Mean Confounder Level", title = "Mean Confounder Level by PRS Decile", color = "Group") +
  scale_color_manual(values = c("Cases" = "blue", "Controls" = "red")) +
  theme_minimal() +
  theme(legend.position = "bottom")

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).