library(dplyr)library(broom)library(tidyr)library(ggplot2)library(data.table)library(ggridges)load("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")pheno_prs=dfheth=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/ethnicity.rds")eth=eth[!is.na(eth$f.21000.0.0),]eth=eth[,c("identifier","meaning")]baseline_levs=readRDS("~/Library/CloudStorage/Dropbox-Personal/dfukb_chol_bp_smoke.rds")baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4PCS=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/baseline_withPCS.rds")[,c(1,20:39)]colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))baseline_levs=merge(baseline_levs,PCS,by="identifier")m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_0_Any","Cad_0_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")m2$Cad_Sta=ifelse(m2$Cad_0_Any==2,1,0)## get these labs readymed_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")#levels(as.factor(med_codes$meaning))## what do you do about meds? I think we need to eliminate, but will skew the distribtuionbaseline_meds=merge(med_codes,baseline_levs,by="identifier")baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)## merge allfinal_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")
Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol:
Code
# final_dat <- final_dat %>%# mutate(recoded_ethnicity = case_when(# meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",# meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",# meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",# meaning == "Chinese" ~ "Chinese",# TRUE ~ "Other" # This will be used for any ethnicity not covered above# ))# # final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)# final_dat$sex=final_dat$f.31.0.0# # table(final_dat$recoded_ethnicity)#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")table(final_dat$rf80)
AFR AMR EAS EUR SAS
8426 707 2249 448138 9085
Code
# prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.SAS.tsv")# prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.AFR.tsv")# prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.EUR.tsv")# SAS_dat=merge(prs_SAS[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")# AFR_dat=merge(prs_AFR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")# EUR_dat=merge(prs_EUR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")## with wallace's prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/SAS_SUM_SCORE_FINAL")prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/AFR_SUM_SCORE_FINAL")prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/EUR_SUM_SCORE_FINAL")colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=c("IID","PRS")SAS_dat=merge(prs_SAS,final_dat,by.x="IID",by.y="identifier")AFR_dat=merge(prs_AFR,final_dat,by.x="IID",by.y="identifier")EUR_dat=merge(prs_EUR,final_dat,by.x="IID",by.y="identifier")pop_data_list=list(SAS_dat,AFR_dat,EUR_dat)names(pop_data_list)=c("SAS","AFR","EUR")
Histograms
We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:
Code
prs_AFR$Population <-'AFR'prs_EUR$Population <-'EUR'prs_SAS$Population <-'SAS'combined_data <-rbind(prs_AFR, prs_EUR, prs_SAS)ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +geom_density_ridges(alpha =0.7) +scale_fill_brewer(palette ="Set1") +theme_ridges() +# Optional: Adds a nice theme suitable for ridgeline plotslabs(title ="Density Distribution of Sum Scores",x ="Sum Score",y ="Population")
Picking joint bandwidth of 0.0191
Impact of PRS
To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach this analysis for both European (EUR) and African (AFR) populations:
Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C.
Step 2: Associate Rescaled LDL-C PRS with CAD
Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.
Here’s an example in R for the European population. The same steps would be repeated for the African population:
Code
library(dplyr)library(survival)# Assuming 'df' is your dataset# List of unique populationspopulations <-c("AFR","SAS","EUR")# Initialize an empty list for storing models and resultscox_models <-list()for(pop in populations) {# Subset data for the specific population#pop_data <- final_dat %>% filter(recoded_ethnicity == pop) pop_data=pop_data_list[[pop]]## scale for each population pop_data$PRS=scale(pop_data$PRS)# Step 1: Linear Regression for LDL-C Levelsif(pop=="AFR"){ ldl_model <-lm(ldl_adj ~ PRS +as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm ="PRS"))) }else{ ldl_model <-lm(ldl_adj ~ PRS +as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10#+as.factor(array) , data = pop_data)print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm ="PRS"))) }# Calculate the PRS scaling factor prs_scale_factor <-40/coef(ldl_model)["PRS"]# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor# Step 2: Cox Model for CAD cox_model <-coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs+as.numeric(enrollage)+as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)#cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial") cox_models[[pop]] <- cox_model}
# Create a data frame to store the resultsresults <-data.frame(Population =character(),HR =numeric(),Lower =numeric(),Upper =numeric(),stringsAsFactors =FALSE)#Extract the results from each modelfor (pop innames(cox_models)) { model_summary <-summary(cox_models[[pop]]) hr <- model_summary$coefficients["scaled_ldl_prs", "exp(coef)"] lower <- model_summary$conf.int["scaled_ldl_prs","lower .95"] upper <- model_summary$conf.int["scaled_ldl_prs","upper .95"]# Add to the results data frame results <-rbind(results, data.frame(Population = pop, HR = hr, Lower = lower, Upper = upper))}ggplot(results, aes(x = Population, y = HR, ymin = Lower, ymax = Upper)) +geom_point() +# Add points for hazard ratiosgeom_errorbar(aes(ymin = Lower, ymax = Upper), width =0.1) +# Add error barscoord_flip() +# Flip coordinates for a horizontal plotxlab("Population") +ylim(c(0.5,2.5))+ylab("Hazard Ratio (HR)") +ggtitle("Forest Plot of Hazard Ratios for LDL-C PRS (per 40 mg/dL) on CAD across Populations") +theme_minimal()
And now with OR/GLM:
Code
# Extract the results from each model# library(broom)# library(dplyr)for(pop in populations) { pop_data=pop_data_list[[pop]] pop_data$PRS=scale(pop_data$PRS)if(pop=="AFR"){ ldl_model <-lm(ldl_adj ~ PRS +as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)print(length(coef(ldl_model))) }else{ldl_model <-lm(ldl_adj ~ PRS +as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10#+as.factor(array), ,data = pop_data)print(length(coef(ldl_model)))}# Calculate the PRS scaling factor prs_scale_factor <-40/coef(ldl_model)["PRS"]# Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor cox_model <-glm(Cad_Sta ~ scaled_ldl_prs+as.numeric(enrollage) +as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data,family ="binomial") cox_models[[pop]] <- cox_model}
[1] 14
[1] 14
[1] 14
Code
# Access and summarize the Cox models for each populationlapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})