We’re interested in how varying the correlation of PRS with the confounder C affects the distribution of the confounder among cases (Y) and controls (non-Y).
To simulate this in R, you can follow these steps:
Define Your Variables and Model: Based on the DAG, it appears PRS affects C, which in turn affects both exposure (X) and the outcome (Y).
Set Up Simulation Parameters: You’ll need to decide the base rates for each variable, the strength of the causal effects (expressed as probabilities or regression coefficients), and the range of correlations between PRS and C you want to explore.
Simulate Data: Use the rnorm() function to generate normally distributed data for PRS and C, adjusting the means and standard deviations to reflect the varying correlation. Use a statistical model (e.g., logistic regression) to simulate the effects of C on X and Y.
Calculate Enrichment: For each simulation, you’ll need to calculate the proportion of cases and controls that have the comorbidity (C).
Repeat and Analyze: Repeat the simulation multiple times for each level of correlation to get a distribution of outcomes. Analyze how the enrichment of the confounder among cases and controls changes with different correlations.
Recall that
The formula for creating C from PRS with correlation \(\r^2\) would be: \[C=r×PRS+\sqrt{(1−r2)} \cdot ϵ\]
\(C=r×PRS \cdot \sqrt{(1−r2)}\cdot \epsilon\)
where \(\epsilon\) is a standard normal error term, independent of PRS.
The term is there to ensure that the variance of C is 1, which is the variance of PRS. The variance of C can be calculated as follows:
Since both PRS and \(\epsilon\) have a variance of 1: \(Var(C)=1\) This ensures that C will have a standard deviation of 1, just like PRS, and the specified correlation rr with PRS.
# Set the number of simulationsn_simulations <-100n_individuals <-10000correlations <-seq(0, 1, by =0.1) # Varying correlations from -1 to 1# Empty dataframe to store resultssimulation_results <-data.frame(correlation =numeric(),enrichment_cases =numeric(),enrichment_controls =numeric())simulation_results <-expand.grid(correlation = correlations, sim_id =1:n_simulations)simulation_results$enrichment_cases <-NAsimulation_results$enrichment_controls <-NA# Simulation indexsim_index <-1beta_X=10beta_Y=10beta_YX=10cor_sim=array(data=NA,dim =c(length(correlations),n_simulations,n_individuals,4))dimnames(cor_sim)=list(correlations,1:n_simulations,1:n_individuals,c("PRS","C","X","Y"))for (i in1:length(correlations)) { r=correlations[i]for (j in1:n_simulations) {# Simulate PRS scores PRS <-rnorm(n_individuals, mean =0, sd =1) cor_sim[i,j,,1]=PRS# Generate an independent standard normal error term epsilon <-rnorm(n_individuals, mean =0, sd =1)# Simulate confounder C with varying correlation with PRS C <- r * PRS +sqrt(1- r^2) * epsilon cor_sim[i,j,,2]=C# Model the effect of C on X and Y using logistic regression# Ensure that the probabilities are reasonable prob_X <-plogis(beta_X * C) # You need to define beta_X X <-rbinom(n_individuals, 1, prob = prob_X) cor_sim[i,j,,3]=X prob_Y <-plogis(beta_Y * C + beta_YX * X) # You need to define beta_Y and beta_YX Y <-rbinom(n_individuals, 1, prob = prob_Y) cor_sim[i,j,,4]=Y }}plot(cor_sim["0.1",1,,"PRS"],cor_sim["0.1","1",,"C"])
# Assuming beta_X, beta_Y, and beta_YX are defined# Initialize a list to hold resultsenrichment_results <-data.frame()# Loop over correlation levelsfor (i in1:length(correlations)) {# Loop over simulationsfor (j in1:n_simulations) {# Extract the data for the current correlation and simulation sim_data <- cor_sim[i, j, , ]# Calculate PRS deciles PRS_deciles <-as.integer(cut(sim_data[, "PRS"], breaks =quantile(sim_data[, "PRS"], probs =0:10/10), include.lowest =TRUE))# Initialize vectors to store enrichment for cases and controls for each decile enrichment_cases <-numeric(10) enrichment_controls <-numeric(10)# Loop over decilesfor (decile in1:10) {# Get the indices for the current decile decile_indices <-which(PRS_deciles == decile)# Calculate enrichment for cases and controls within the decile enrichment_cases[decile] <-mean(sim_data[decile_indices, "C"][sim_data[decile_indices, "X"] ==1]) enrichment_controls[decile] <-mean(sim_data[decile_indices, "C"][sim_data[decile_indices, "X"] ==0]) }# Combine enrichment for cases and controls and add to the results data frame enrichment_results <-rbind(enrichment_results, data.frame(correlation = correlations[i],simulation = j,decile =1:10,enrichment_cases = enrichment_cases,enrichment_controls = enrichment_controls )) }}# Now you can plot using ggplot2 or base R plotting functions
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
# Summarize enrichment across simulations for each decile and correlation levelsummary_results <- enrichment_results %>%group_by(correlation, decile) %>%summarize(mean_enrichment_cases =mean(enrichment_cases),sd_enrichment_cases =sd(enrichment_cases),mean_enrichment_controls =mean(enrichment_controls),sd_enrichment_controls =sd(enrichment_controls) )
`summarise()` has grouped output by 'correlation'. You can override using the
`.groups` argument.
# Assuming simulation_results is already summarized with mean valueslibrary(ggplot2)# Assuming your summary dataframe is set up correctlyggplot(summary_results, aes(x =as.factor(decile), y = mean_enrichment_cases, group =as.factor(decile), color =as.factor(decile))) +geom_point() +geom_errorbar(aes(ymin = mean_enrichment_cases - sd_enrichment_cases, ymax = mean_enrichment_cases + sd_enrichment_cases), width =0.1) +facet_wrap(~ correlation, scales ="free_y") +labs(x ="PRS Decile", y ="Mean Confounding Score of Cases (SD)", title =" Confounder Enrichment by PRS Decile and Correlation",color="PRS Decile") +theme_minimal() +theme(legend.position ="bottom")
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
We can simulate a few scenarios:
# Disease Scenarios - Define your betas based on literature scenarios <-list("CAD"=list(beta_X =0.5, beta_Y =0.2, beta_YX =0.4),"ALZ"=list(beta_X =0.2, beta_Y =0.3, beta_YX =0.2),"BC"=list(beta_X = ..., beta_Y = ..., beta_YX = ...) # Fill in for Breast Cancer) for(s in1:length(scenarios)){for (i in1:length(correlations)) { r <- correlations[i] for (j in1:n_simulations) { PRS <-rnorm(n_individuals, mean =0, sd =1)# Confounder Simulation (Example with CAD - Blood Pressure) confounder <- r * PRS +sqrt(1- r^2) * epsilon cor_sim[i, j, , 2] <- confounder# Select disease scenario (Example with CAD) scenario <- scenarios[[x]] # Model the effect of C on X and Y using logistic regression prob_X <-plogis(scenario$beta_X * confounder) X <-rbinom(n_individuals, 1, prob = prob_X) cor_sim[i, j, , 3] <- X prob_Y <-plogis(scenario$beta_Y * confounder + scenario$beta_YX * X) Y <-rbinom(n_individuals, 1, prob = prob_Y) cor_sim[i, j, , 4] <- Y }}
library(dplyr)library(ggplot2)library(dplyr)summary_results <- enrichment_results %>%group_by(correlation, decile) %>%summarize(mean_enrichment_cases =mean(enrichment_cases),se_enrichment_cases =sd(enrichment_cases) /sqrt(n()),mean_enrichment_controls =mean(enrichment_controls),se_enrichment_controls =sd(enrichment_controls) /sqrt(n()),.groups ='drop' )library(ggplot2)ggplot(summary_results, aes(x =as.factor(decile))) +geom_line(aes(y = mean_enrichment_cases, group = correlation, color ="Cases"), size =1) +geom_errorbar(aes(ymin = mean_enrichment_cases - se_enrichment_cases, ymax = mean_enrichment_cases + se_enrichment_cases, group = correlation, color ="Cases"), width =0.2) +geom_line(aes(y = mean_enrichment_controls, group = correlation, color ="Controls"), size =1, linetype ="dashed") +geom_errorbar(aes(ymin = mean_enrichment_controls - se_enrichment_controls, ymax = mean_enrichment_controls + se_enrichment_controls, group = correlation, color ="Controls"), width =0.2, linetype ="dashed") +facet_wrap(~ correlation, scales ="free_y") +labs(x ="PRS Decile", y ="Mean Confounder Level", title ="Mean Confounder Level by PRS Decile", color ="Group") +scale_color_manual(values =c("Cases"="blue", "Controls"="red")) +theme_minimal() +theme(legend.position ="bottom")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).
Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).