Ancestry PRS Load

Author

Sarah Urbut

Published

January 7, 2024

LDL PRS analyais

Code

library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
load("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/output/merged_pheno_censor_final_withdrugs_smoke.rds")
pheno_prs=dfh
eth=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/ethnicity.rds")
eth=eth[!is.na(eth$f.21000.0.0),]
eth=eth[,c("identifier","meaning")]


baseline_levs=readRDS("~/Library/CloudStorage/Dropbox-Personal/dfukb_chol_bp_smoke.rds")
baseline_levs$hdl=baseline_levs$f.30760.0.0*38.4
baseline_levs$ldl=baseline_levs$f.30780.0.0*38.4

PCS=readRDS("~/Library/CloudStorage/Dropbox-Personal/pheno_dir/baseline_withPCS.rds")[,c(1,20:39)]
colnames(PCS)=c("identifier",paste0(rep("PC",20),seq(1:20)))
baseline_levs=merge(baseline_levs,PCS,by="identifier")
m2=merge(pheno_prs[,c("identifier","Birthdate","enrollage","Cad_0_Any","Cad_0_censor_age","f.31.0.0")],eth,by.x="identifier",by.y="identifier")

m2$Cad_Sta=ifelse(m2$Cad_0_Any==2,1,0)

## get these labs ready
med_codes=readRDS("~/Library/CloudStorage/Dropbox-Personal/medcodes.rds")
#levels(as.factor(med_codes$meaning))
## what do you do about meds? I think we need to eliminate, but will skew the distribtuion
baseline_meds=merge(med_codes,baseline_levs,by="identifier")
baseline_meds$ldl_adj=ifelse(baseline_meds$meaning%in%"Cholesterol lowering medication",baseline_meds$ldl/0.8,baseline_meds$ldl)
baseline_meds$sbp_adj=ifelse(baseline_meds$meaning%in%"Blood pressure medication",baseline_meds$f.4080.0.0+10,baseline_meds$f.4080.0.0)

## merge all
final_dat=merge(m2,baseline_meds[,c("identifier","sbp_adj","ldl_adj","hdl",paste0(rep("PC",10),seq(1:10)))],by.x="identifier",by.y="identifier")

Now we want to calculate ancestry specific thresholds for PRS. Let’s get the PRS for African, European, and Asian ancestry that causes a 40 mmHg increase in LDL cholesterol:

Code

# final_dat <- final_dat %>%
#   mutate(recoded_ethnicity = case_when(
#     meaning %in% c("African", "Any other Black background", "Black or Black British","Caribbean","Any other mixed background","Mixed") ~ "AFR",
#     meaning %in% c("White", "Irish", "Any other white background","British") ~ "White",
#     meaning %in% c("Indian", "Pakistani","Bangladeshi") ~ "South Asian",
#     meaning == "Chinese" ~ "Chinese",
#     TRUE ~ "Other" # This will be used for any ethnicity not covered above
#   ))
# 
# final_dat$Cad_Sta=ifelse(final_dat$Cad_0_Any==2,1,0)
# final_dat$sex=final_dat$f.31.0.0
# 
# table(final_dat$recoded_ethnicity)
#ggplot(lt,aes(x=Var1,y=value,fill=Var1))+geom_bar(stat="identity")

gae=fread("~/Library/CloudStorage/Dropbox-Personal/ukb.kgp_projected.tsv.gz")
final_dat=merge(final_dat,gae[,c("eid","rf80")],by.x="identifier",by.y="eid")
table(final_dat$rf80)


   AFR    AMR    EAS    EUR    SAS 
  8426    707   2249 448138   9085

Code

# prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.SAS.tsv")
# prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.AFR.tsv")
# prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/ukbb_ldlcprs.EUR.tsv")

# SAS_dat=merge(prs_SAS[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")
# AFR_dat=merge(prs_AFR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")
# EUR_dat=merge(prs_EUR[,c(1:5,20)],final_dat,by.x="eid",by.y="identifier")

## with wallace's 

prs_SAS=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/SAS_SUM_SCORE_FINAL")
prs_AFR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/AFR_SUM_SCORE_FINAL")
prs_EUR=fread("~/Library/CloudStorage/Dropbox-Personal/PRSethnicity/EUR_SUM_SCORE_FINAL")

colnames(prs_SAS)=colnames(prs_AFR)=colnames(prs_EUR)=c("IID","PRS")

SAS_dat=merge(prs_SAS,final_dat,by.x="IID",by.y="identifier")
AFR_dat=merge(prs_AFR,final_dat,by.x="IID",by.y="identifier")
EUR_dat=merge(prs_EUR,final_dat,by.x="IID",by.y="identifier")


pop_data_list=list(SAS_dat,AFR_dat,EUR_dat)
names(pop_data_list)=c("SAS","AFR","EUR")

Histograms

We note that the distributions of these PRS are skewed for non EUR pops but will normalize for our analysis:

Code

prs_AFR$Population <- 'AFR'
prs_EUR$Population <- 'EUR'
prs_SAS$Population <- 'SAS'

combined_data <- rbind(prs_AFR, prs_EUR, prs_SAS)

ggplot(combined_data, aes(x = PRS, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Sum Scores",
         x = "Sum Score",
         y = "Population")

Picking joint bandwidth of 0.0191

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by PRS) relate to CAD risk. Here’s how you can approach this analysis for both European (EUR) and African (AFR) populations:

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.
Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C.

Step 2: Associate Rescaled LDL-C PRS with CAD

Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.
Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R for the European population. The same steps would be repeated for the African population:

Code

library(dplyr)
library(survival)

# Assuming 'df' is your dataset

# List of unique populations
populations <- c("AFR","SAS","EUR")

# Initialize an empty list for storing models and results
cox_models <- list()

for(pop in populations) {
  # Subset data for the specific population
  #pop_data <- final_dat %>% filter(recoded_ethnicity == pop)
  pop_data=pop_data_list[[pop]]
  ## scale for each population
  pop_data$PRS=scale(pop_data$PRS)
  # Step 1: Linear Regression for LDL-C Levels
  if(pop=="AFR"){
     ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
     print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm = "PRS")))
  }
  else{
    ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10#+as.factor(array)
                    , data = pop_data)
  print(paste("LDL_coef_CI:",pop,":",confint(ldl_model,parm = "PRS")))
  }
  
  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  
  # Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  # Step 2: Cox Model for CAD
  cox_model <- coxph(Surv(Cad_0_censor_age, Cad_Sta) ~ scaled_ldl_prs+ as.numeric(enrollage)+as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
}

[1] "LDL_coef_CI: AFR : 9.00864843839882" "LDL_coef_CI: AFR : 9.19800270887616"
[1] "LDL_coef_CI: SAS : 8.46203833857587" "LDL_coef_CI: SAS : 8.6406298276037" 
[1] "LDL_coef_CI: EUR : 11.4831889619901" "LDL_coef_CI: EUR : 11.6565271139168"

Code

# Access and summarize the Cox models for each population
lapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})

$AFR
scaled_ldl_prs 
      1.533326 

$SAS
scaled_ldl_prs 
      1.473347 

$EUR
scaled_ldl_prs 
      1.505904

Now let’s look at a meta analysis:

Code

# Create a data frame to store the results
results <- data.frame(Population = character(),
                      HR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

 #Extract the results from each model
 for (pop in names(cox_models)) {
   model_summary <- summary(cox_models[[pop]])
   hr <- model_summary$coefficients["scaled_ldl_prs", "exp(coef)"]
  lower <- model_summary$conf.int["scaled_ldl_prs","lower .95"]
  upper <- model_summary$conf.int["scaled_ldl_prs","upper .95"]

  # Add to the results data frame
  results <- rbind(results, data.frame(Population = pop, HR = hr, Lower = lower, Upper = upper))
}

ggplot(results, aes(x = Population, y = HR, ymin = Lower, ymax = Upper)) +
  geom_point() + # Add points for hazard ratios
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) + # Add error bars
  coord_flip() + # Flip coordinates for a horizontal plot
  xlab("Population") +ylim(c(0.5,2.5))+
  ylab("Hazard Ratio (HR)") +
  ggtitle("Forest Plot of Hazard Ratios for LDL-C PRS (per 40 mg/dL) on CAD  across Populations") +
  theme_minimal()

And now with OR/GLM:

Code

# Extract the results from each model
# library(broom)
# library(dplyr)

for(pop in populations) {
 
  pop_data=pop_data_list[[pop]]
  pop_data$PRS=scale(pop_data$PRS)

  
   if(pop=="AFR"){
     ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data)
     print(length(coef(ldl_model)))
  }
  else{ldl_model <- lm(ldl_adj ~ PRS + as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10#+as.factor(array),
                       ,data = pop_data)
  print(length(coef(ldl_model)))}
  
  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  
  # Scale the LDL-C PRS so that now the scaled translates to an increase of 40 mgDL
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  

    cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+as.numeric(enrollage) + as.factor(f.31.0.0)+PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8+PC9+PC10, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
}

[1] 14
[1] 14
[1] 14

Code

# Access and summarize the Cox models for each population
lapply(cox_models, function(x){exp(coef(x)["scaled_ldl_prs"])})

$AFR
scaled_ldl_prs 
      1.578216 

$SAS
scaled_ldl_prs 
      1.509432 

$EUR
scaled_ldl_prs 
       1.54848

Code

# 
results <- data.frame(Population = character(),
                      OR = numeric(),
                      Lower = numeric(),
                      Upper = numeric(),
                      stringsAsFactors = FALSE)

for (pop in names(cox_models)) {
    model_summary <- tidy(cox_models[[pop]], conf.int = TRUE, exponentiate = TRUE)

    if ("scaled_ldl_prs" %in% model_summary$term) {
        term_row <- model_summary %>% filter(term == "scaled_ldl_prs")

        results <- rbind(results, data.frame(Population = pop,
                                             OR = term_row$estimate,
                                             Lower = term_row$conf.low,
                                             Upper = term_row$conf.high))
    } else {
        results <- rbind(results, data.frame(Population = pop, OR = NA, Lower = NA, Upper = NA))
    }
}
# # Create the forest plot using ggplot2
# 
ggplot(results, aes(x = Population, y = OR, ymin = Lower, ymax = Upper)) +
    geom_point() +
    geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) +
    coord_flip()+ylim(c(0.5,2.5))+
    xlab("Population") +
    ylab("Odds Ratio (OR)") +
    ggtitle("Forest Plot of Odds Ratios for LDL-C PRS (per 40 mg/dL) on CAD across Populations") +
    theme_minimal()