baseline_covs =fread("mgbb_data/mgbb_pce_consent_date.csv")pheno_all =left_join(pheno_incident, pheno_has[, c("Subject_Id", "prevalent_mi_date", "prevalent_mi")], by ="Subject_Id")pheno_all_with_covs =left_join(pheno_all, baseline_covs[, c("EMPI","mgbb_consent_date","enrollment_age","sex_imputed","lipidmed")])## we can see that the rates are much higher for biobank particantspheno_all$either =ifelse((pheno_all$prevalent_mi + pheno_all$incident_mi) >0, 1, 0)pheno_all %>%group_by(inbiobank) %>%summarise(mean(either))
PCS =fread("mgbb_data/GSA_53K.hg38.insamplePCA.tsv.gz")pheno_dat =merge(pheno_all_with_covs, ldl, by ="EMPI")dat_with_PCS =merge(PCS[, c(1:11)], pheno_dat, by ="IID")final_dat =merge(ancestry, dat_with_PCS, by ="IID")final_dat <- final_dat %>%mutate(mgbb_consent_date =as.Date(mgbb_consent_date, "%Y-%m-%d"),incident_cad_date =as.Date(incident_mi_date, "%Y-%m-%d"),prevalent_cad_date =as.Date(prevalent_mi_date, "%Y-%m-%d"),birthdate = mgbb_consent_date - (enrollment_age *365.25),# Approximating leap yearsage_at_incident_cad =as.numeric(difftime(incident_mi_date, birthdate, units ="days")) /365.25,age_at_prevalent_cad =ifelse(is.na(prevalent_mi_date), NA, as.numeric(difftime(prevalent_cad_date, birthdate, units ="days") ) /365.25) )final_dat <- final_dat %>%mutate(# Calculate censor flagCAD_censor =ifelse(incident_mi ==1| prevalent_mi ==1, 1, 0),# Calculate age at censoring or follow-upCAD_censor_age =case_when(# Use age at prevalent CAD if prevalent CAD exists!is.na(age_at_prevalent_cad) & prevalent_mi ==1~ age_at_prevalent_cad,# Use age at incident CAD if no prevalent available!is.na(age_at_incident_cad) & incident_mi ==1~ age_at_incident_cad,# Calculate age at follow-up end date if no CADTRUE~as.numeric(difftime(as.Date("2023-06-15"), birthdate, units ="days")) /365.25 ) )### note that only 26000 people had LDLfinal_dat$ldladj =ifelse(final_dat$lipidmed =="yes", final_dat$ldlc *5/4, final_dat$ldlc)
Phenotyping
We validated previously that MGB pheno implementation of Hard CAD is matched to HARD Cad. So we will now use that here to be c/w AoU HARD CAD pheno.
Because we are using age as time scale (or GLM with ‘evere event’ follow up time is really just time from birth to death.
We know that time is inhomogenous: even if we use OR instead of HR, the OR will tend to decrease over time because there are more non-genetically mediated CAD events
Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol in the UK Biobank using the AFR, SAS, and EUR individauls present.
Min. 1st Qu. Median Mean 3rd Qu. Max.
-5.22464 -0.58991 0.09983 0.00000 0.68542 3.85568
Code
ggplot(totaldata, aes(x = pop_scaled_prs, y = Population, fill = Population)) +geom_density_ridges(alpha =0.7) +scale_fill_brewer(palette ="Set1") +theme_ridges() +# Optional: Adds a nice theme suitable for ridgeline plotslabs(title ="Density Distribution of Sum Scores: Scaled for all",x ="Sum Score",y ="Population")
The following objects are masked from 'package:base':
units, units<-
Code
totaldata$followup <-round(difftime(totaldata$CAD_censor_age, totaldata$enrollment_age), 2)dat <- totaldatalabel(dat$sex_imputed)="sex"label(dat$CAD_censor)="CAD"dat$CAD_censor=as.factor(dat$CAD_censor)#table1(~sex_imputed+ldladj+CAD_censor+enrollment_age+pop_scaled_prs+scaled_resid|knn,data = totaldata)# Format the pop_scaled_prs variable to 2 decimal placesdat$pop_scaled_prs <-as.numeric(as.character(round(dat$pop_scaled_prs,2)))dat <- totaldatadat$CAD_censor <-as.factor(dat$CAD_censor)label(dat$sex_imputed) <-"Sex"label(dat$ldladj) <-"LDL-C (adjusted for Statin)"label(dat$CAD_censor) <-"CAD Ever"label(dat$scaled_resid) <-"Overall Scaled Residual PRS"label(dat$pop_scaled_prs) <-"Pop Scaled Residual PRS"label(dat$enrollment_age) ="Age Enrolled"# Continue with creating the tabletable1(~sex_imputed + ldladj + CAD_censor + enrollment_age +round(pop_scaled_prs,4) + scaled_resid | knn, data = dat)
AFR (N=2890)
AMR (N=3849)
EAS (N=1061)
EUR (N=44870)
SAS (N=605)
Overall (N=53275)
Sex
female
1821 (63.0%)
2528 (65.7%)
666 (62.8%)
24340 (54.2%)
294 (48.6%)
29649 (55.7%)
male
1069 (37.0%)
1321 (34.3%)
395 (37.2%)
20530 (45.8%)
311 (51.4%)
23626 (44.3%)
LDL-C (adjusted for Statin)
Mean (SD)
113 (48.1)
109 (44.4)
105 (36.4)
110 (37.9)
105 (36.1)
110 (39.1)
Median [Min, Max]
106 [11.0, 828]
104 [15.0, 1090]
101 [26.3, 246]
106 [12.5, 568]
99.0 [25.0, 235]
106 [11.0, 1090]
Missing
1223 (42.3%)
1974 (51.3%)
598 (56.4%)
22676 (50.5%)
322 (53.2%)
26793 (50.3%)
CAD Ever
0
1750 (60.6%)
2801 (72.8%)
952 (89.7%)
34652 (77.2%)
515 (85.1%)
40670 (76.3%)
1
1140 (39.4%)
1048 (27.2%)
109 (10.3%)
10218 (22.8%)
90 (14.9%)
12605 (23.7%)
Age Enrolled
Mean (SD)
48.7 (16.3)
42.2 (15.6)
41.9 (16.0)
55.1 (16.9)
41.4 (15.9)
53.4 (17.2)
Median [Min, Max]
49.4 [17.8, 98.4]
40.1 [1.65, 95.5]
38.6 [17.9, 89.1]
57.7 [1.98, 102]
37.3 [3.66, 85.9]
55.8 [1.65, 102]
Pop Scaled Residual PRS
Mean (SD)
-0.0000000692 (1.00)
-0.0000000779 (1.00)
-0.00000189 (1.00)
0.0000000713 (1.00)
0.00000116 (1.00)
0.0000000263 (1.00)
Median [Min, Max]
0.239 [-4.82, 2.18]
0.0800 [-4.69, 2.75]
0.276 [-5.22, 1.90]
0.0841 [-5.08, 3.86]
0.0690 [-4.09, 2.77]
0.0998 [-5.22, 3.86]
Overall Scaled Residual PRS
Mean (SD)
-0.0797 (1.25)
0.217 (0.992)
-0.0303 (1.01)
-0.0114 (0.982)
-0.0991 (0.779)
-0.0000000000000000371 (1.00)
Median [Min, Max]
0.219 [-6.12, 2.65]
0.296 [-4.44, 2.95]
0.250 [-5.33, 1.89]
0.0711 [-5.00, 3.78]
-0.0453 [-3.28, 2.06]
0.0986 [-6.12, 3.78]
Impact of PRS
To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. TAMR approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach tAMR analysis for both European (EUR) and African (AFR) populations:
Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from tAMR model to reflect a 40 mg/dL change in LDL-C.
Step 2: Associate Rescaled LDL-C PRS with CAD
Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.
Here’s an example in R for the European population. The same steps would be repeated for the African population:
Code
library(dplyr)library(survival)# Assuming 'df' is your dataset# List of unique populationspopulations <-c("AFR", "SAS", "EUR", "AMR", "EAS")# Initialize an empty list for storing models and resultscox_models <-list()glm_models =list()results <-data.frame(Population =character(),PRS_Mean =numeric(),PRS_SD =numeric(),Percent_male =numeric(),LDL_Coef_CI_Lower =numeric(),LDL_Coef_CI_Upper =numeric(),Population_Scale_Factor =numeric(),LDL_Scaled_Coef_CI_Lower =numeric(),LDL_Scaled_Coef_CI_Upper =numeric(),HR =numeric(),HR_CI_Lower =numeric(),HR_CI_Upper =numeric(),OR =numeric(),OR_LCI =numeric(),OR_UCI =numeric(),stringsAsFactors =FALSE)for (pop in populations) {# Subset data for the specific population pop_data = totaldata[totaldata$knn %in% pop,]# scale the residual score per pouplation#pop_data$PRS = scale(pop_data$residscore) pop_data$PRS = pop_data$pop_scaled_prs##print("PRS Summary")#print(summary(pop_data$PRS)) ldl_model <-lm( ldladj ~ PRS +as.numeric(enrollment_age) +as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data[!is.na(pop_data$ldladj),] ) ldl_coef_ci <-confint(ldl_model, "PRS")#print(paste("LDL_coef_CI for population:", pop))if ("PRS"%in%names(coef(ldl_model))) {#print(confint(ldl_model, parm = "PRS")) } else {#print("PRS coefficient is not available in the model.") }# Calculate the PRS scaling factor prs_scale_factor <-40/coef(ldl_model)["PRS"]#print(paste0(pop,":",prs_scale_factor))#prs_scale_factor=1# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor ldl_model_2 <-lm( ldladj ~ scaled_ldl_prs +as.numeric(enrollment_age) +as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data[!is.na(pop_data$ldladj),] ) ldl_scaled_coef_ci <-confint(ldl_model_2, "scaled_ldl_prs")#print(summary(pop_data$scaled_ldl_prs))# Step 2: Cox Model for CAD## since age is time scale don't use cox_model <-coxph(Surv(CAD_censor_age, CAD_censor) ~ scaled_ldl_prs+as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data )#cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial") cox_models[[pop]] <- cox_model glm_model <-glm( CAD_censor ~ scaled_ldl_prs + enrollment_age+as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,data = pop_data,family ="binomial" ) glm_models[[pop]] <- glm_model glm_models[[pop]] <- glm_model log_or =summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"] log_or_se =summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"] or =round(exp(log_or), 2) lci =round(exp(log_or -1.96* log_or_se), 2) uci =round(exp(log_or +1.96* log_or_se), 2) glmci =confint(glm_model)# Extract summary stats cox_summary <-summary(cox_model) glm_summary <-summary(glm_model)# Compile results into the data frame# Append to results results <-rbind( results,data.frame(Population = pop,PRS_Mean =mean(pop_data$PRS),PRS_SD =sd(pop_data$PRS),Percent_male =mean(pop_data$sex_imputed =="male"),LDL_Coef_CI_Lower = ldl_coef_ci[1],LDL_Coef_CI_Upper = ldl_coef_ci[2],Population_Scale_Factor = prs_scale_factor,LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],# Adjust according to your calculationsLDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],HR = cox_summary$conf.int["scaled_ldl_prs", 1],HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],# Assuming 'cox_summary' is properly definedHR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],# Assuming 'cox_summary' is properly definedOR = or,OR_LCI =exp(glmci['scaled_ldl_prs', 1]),OR_UCI =exp(glmci['scaled_ldl_prs', 2]) ) )}
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
saveRDS(results, "results_mgb.rds")# Access and summarize the Cox models for each populationlapply(cox_models, function(x) {exp(coef(x)["scaled_ldl_prs"])})
Now we want to take a two-step Bayesian approach where you first calculate a global mean hazard ratio (HR) or odds ratio (OR) from population-specific estimates and then use tAMR global mean as a prior in a Bayesian adjustment of the population-specific estimates. In such a case, weighting the global mean by the standard error (or variance) of each population-specific estimate is indeed a good approach, as it accounts for the different levels of uncertainty in each population estimate.
Calculate Weighted Global Mean HR/OR:
For each population, you have an HR/OR and its standard error (SE).
You can weight the HR/OR by the inverse of the variance (which is SE²).
The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.
Use Weighted Mean as Prior for Bayesian Adjustment:
Use the weighted global mean HR/OR as the mean for your prior distribution in the Bayesian model for each population.
The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.
Variance and Weights
$$$$The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval: \[\text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2
\]
Based on tAMR variance, the weight for each observation is:
\[
w_i = \frac{1}{\text{Variance}_i}
\]
Weighted Mean The weighted mean is then calculated as:
\[ \sum_{i=1}^{n} w_i^2 \] is the sum of the squared weights, and \[ x_i \] is each individual log-transformed OR.
Code
# Assuming you have a dataframe 'results' with columns 'Population', 'OR', and 'SE'# Calculate variance and weightsresults <- results %>%mutate(log_OR =log(OR),log_Lower =log(Lower),log_Upper =log(Upper),Variance = ((log_Upper - log_Lower) / (2*1.96))^2,Weight =1/ Variance )# Calculate weighted mean and variance for the priorsum_weights <-sum(results$Weight)sum_squared_weights <-sum(results$Weight^2)weighted_mean_log_OR <-sum(results$log_OR * results$Weight) / sum_weightsweighted_variance <- (sum_weights / (sum_weights^2- sum_squared_weights)) *sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)prior_mean <- weighted_mean_log_ORprior_variance <- weighted_variance# Perform Bayesian updating for each populationresults <- results %>%mutate(Posterior_Variance =1/ (1/ prior_variance +1/ Variance),Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance) )# Exponentiating the posterior mean to get back to the OR scaleresults$Posterior_OR <-exp(results$Posterior_Mean)results <- results %>%mutate(Posterior_Lower_Log = Posterior_Mean -1.96*sqrt(Posterior_Variance),Posterior_Upper_Log = Posterior_Mean +1.96*sqrt(Posterior_Variance),Posterior_Lower_OR =exp(Posterior_Lower_Log),Posterior_Upper_OR =exp(Posterior_Upper_Log) )# Now 'results' contains the posterior mean and variance for each populationprint(results)
library(forestplot)library(ggplot2)library(dplyr)# Assuming 'results' is already loaded with your data# We need to create a new data frame for ggplot# First, we create two separate data frames for original and posterior results and then bind them together# plot_data <- results %>%# select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%# gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%# mutate(# ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),# ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),# Type = factor(Type, levels = c("OR", "Posterior_OR"))# )# # # Now we plot with ggplot2# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +# geom_errorbarh(height = 0.2) +# geom_point(size = 2.5) +# scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +# theme_minimal() + lims(x=c(0.5,2.5))+# labs(x = "Odds Ratio", y = "", color = "Estimate Type") +# theme(legend.position = "bottom")# Adding an identifier for each rowresults$ID <-seq_along(results$Population)results$ID <-seq_along(results$Population)# Create a long-format data frame for plottingplot_data <- tidyr::pivot_longer( results, cols =c("OR", "Posterior_OR"), names_to ="Estimate_Type", values_to ="Value") %>%mutate(ymin =ifelse(Estimate_Type =="OR", Lower, Posterior_Lower_OR),ymax =ifelse(Estimate_Type =="OR", Upper, Posterior_Upper_OR),Estimate_Type =factor(Estimate_Type, levels =c("OR", "Posterior_OR")),# Adjust the y position of the posterior estimatesypos =as.numeric(ID) +ifelse(Estimate_Type =="OR", -0.1, 0.1) )# Create the plotggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +geom_point(size =3) +geom_errorbar(aes(xmin = ymin, xmax = ymax), width =0.2) +scale_color_manual(values =c("OR"="black", "Posterior_OR"="red")) +labs(x ="Odds Ratio", y ="") +theme_minimal() +theme(axis.text.y =element_text(vjust =0.5), axis.ticks.y =element_blank(),panel.grid.major =element_blank(),panel.grid.minor =element_blank(),legend.position ="bottom") +scale_y_continuous(breaks =1:length(unique(plot_data$ID)), labels = results$Population)
We can do tAMR in stan and calculate heterogneiety as well:
Code
library(brms)results <- results %>%mutate(se_log_OR =1/sqrt(Weight))# Given prior study's log OR and its standard errorprior_log_or <-log(1/0.76) # log of the hazard ratio from the studyprior_sd <- (log(1/0.64) -log(1/0.91)) / (2*1.96) # standard error of the log OR# or try tAMR:# # rosuvastatin_with_event <- 235# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event# # placebo_with_event <- 304# placebo_without_event <- 6344 - placebo_with_event# # # Compute the OR# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event# odds_control <- placebo_with_event / placebo_without_event# OR <- odds_control / odds_case# # # Convert OR to log scale for the prior# prior_log_or <- log(OR)# # # Calculate standard error of the log OR# prior_log_or_se <- sqrt(1 / placebo_with_event +# 1 / placebo_without_event +# 1 / rosuvastatin_with_event +# 1 / rosuvastatin_without_event)# # # Print the prior log OR and its standard error# print(prior_log_or)# print(prior_log_or_se)# # # Print the OR# print(OR)# # Define the priors based on the prior study for the global effect size#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")prior_effect <-set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class ="Intercept")# Define a non-informative prior for the heterogeneityprior_heterogeneity <-set_prior("student_t(3, 0, 10)", class ="sd", group ="Population")prior_global <-set_prior(paste0("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class ="Intercept")prior_heterogeneity <-set_prior("student_t(3, 0, 10)", class ="sd",group ="Population")# Fit the modelmeta_analysis_model <-brm(bf(log_OR |se(se_log_OR) ~1+ (1| Population)),data = results,prior =c(prior_global, prior_heterogeneity),family =gaussian(),chains =4,iter =4000,warmup =2000,control =list(adapt_delta =0.999999) # adjust for convergence)# Check the summary of the modelsummary(meta_analysis_model)# Extract population-specific effectsranef(meta_analysis_model)global_effect=fixef(meta_analysis_model)[1,1]# Get the random effects (population-specific deviations)population_effects <-ranef(meta_analysis_model)# Calculate the population-specific log ORs by adding the global effect to the random effectspopulation_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]# Convert the log ORs to ORspopulation_specific_ORs <-exp(population_specific_log_ORs)# Now you have the population-specific ORsprint(population_specific_ORs)
Heterogeneity assessment:
Code
# Extract the standard deviation of the random effectsheterogeneity_estimate <-VarCorr(meta_analysis_model)$Population$sd# Print the heterogeneity estimateprint(heterogeneity_estimate)# Conduct posterior predictive checkspp_check(meta_analysis_model)# If you want to plot the distribution of the standard deviation of the random effectsposterior_samples <-posterior_samples(meta_analysis_model, pars ="sd_Population__Intercept")AMRt(posterior_samples[, "sd_Population__Intercept"], breaks =40, main ="Distribution of Heterogeneity (sd Population)",x="Heterogeneity (SD)",fill="blue")# Summarize the posterior distribution of the heterogeneity parameterposterior_summary <-summary(posterior_samples[, "sd_Population__Intercept"])print(posterior_summary)