Ancestry PRS Load

Author

Sarah Urbut

Published

March 7, 2024

LDL PRS analysis

Code
library(dplyr)
library(broom)
library(tidyr)
library(ggplot2)
library(data.table)
library(ggridges)
library(lubridate)
library(stringr)
library(UpSetR)


prs = fread("LDL_MultiAncestry.hg38.MGBB.sscore")
ancestry = fread("mgbb_data/GSA_53K.hg38.1KGPCA.tsv.gz")[, c(1:3)]
s = stringr::str_split_fixed(ancestry$IID, pattern = "-", n = 2)


ldlwithinon = fread("mgbb_data/mgbb_ldl_consentdate.csv")
linker = fread("mgbb_data/all_cvd_phenos.mgbb.tsv")
ldl = merge(linker[, c("IID", "EMPI", "Subject_Id")], ldlwithinon, by =
              "EMPI", all.x = T)

## we will use consent date

pheno_incident = fread("mgbb_data/mgbb_incident_mi_consentdate.csv")
pheno_incident$incident_mi_date = pheno_incident$Date
pheno_has = fread("mgbb_data/mgbb_prevalent_mi_consentdate.csv")
pheno_has$prevalent_mi_date = pheno_has$Date




# pheno_incident = fread("mgbb_data/mgbb_incident_cad_dia_proc_consentdate.csv")
# pheno_incident$incident_mi_date = pheno_incident$incident_cad_date
# pheno_has = fread("mgbb_data/mgbb_prevalent_cad_dia_proc_consentdate.csv")
# pheno_has$prevalent_mi_date = pheno_has$prevalent_cad_date 
# 
# pheno_incident$incident_mi=pheno_incident$incident_cad 
# pheno_has$prevalent_mi=pheno_has$prevalent_cad 
# 


sum(pheno_incident$incident_mi == 1)
[1] 21358
Code
sum(pheno_has$prevalent_mi == 1)
[1] 11293
Code
pheno_has$inbiobank = ifelse(pheno_has$Subject_Id %in% s[, 2], 1, 0)
pheno_has %>% group_by(inbiobank) %>% summarise(mean(prevalent_mi))
# A tibble: 2 × 2
  inbiobank `mean(prevalent_mi)`
      <dbl>                <dbl>
1         0               0.0657
2         1               0.101 
Code
pheno_incident$inbiobank = ifelse(pheno_incident$Subject_Id %in% s[, 2], 1, 0)
pheno_incident %>% group_by(inbiobank) %>% summarise(mean(incident_mi))
# A tibble: 2 × 2
  inbiobank `mean(incident_mi)`
      <dbl>               <dbl>
1         0               0.121
2         1               0.198
Code
baseline_covs = fread("mgbb_data/mgbb_pce_consent_date.csv")
pheno_all = left_join(pheno_incident, pheno_has[, c("Subject_Id", "prevalent_mi_date", "prevalent_mi")], by = "Subject_Id")

pheno_all_with_covs = left_join(pheno_all, baseline_covs[, c("EMPI",
                                                             "mgbb_consent_date",
                                                             "enrollment_age",
                                                             "sex_imputed",
                                                             "lipidmed")])


## we can see that the event rates are much higher for biobank participants
pheno_all$either = ifelse((pheno_all$prevalent_mi + pheno_all$incident_mi) >
                            0, 1, 0)
pheno_all %>% group_by(inbiobank) %>% summarise(mean(either))
# A tibble: 2 × 2
  inbiobank `mean(either)`
      <dbl>          <dbl>
1         0          0.156
2         1          0.237
Code
PCS = fread("mgbb_data/GSA_53K.hg38.insamplePCA.tsv.gz")
pheno_dat = merge(pheno_all_with_covs, ldl, by = "EMPI")
dat_with_PCS = merge(PCS[, c(1:11)], pheno_dat, by = "IID")
final_dat = merge(ancestry, dat_with_PCS, by = "IID")



final_dat <- final_dat %>%
  mutate(
    mgbb_consent_date = as.Date(mgbb_consent_date, "%Y-%m-%d"),
    incident_cad_date = as.Date(incident_mi_date, "%Y-%m-%d"),
    prevalent_cad_date = as.Date(prevalent_mi_date, "%Y-%m-%d"),
    birthdate = mgbb_consent_date - (enrollment_age * 365.25),
    # Approximating leap years
    age_at_incident_cad = as.numeric(difftime(incident_cad_date, birthdate, units = "days")) / 365.25,
    age_at_prevalent_cad = ifelse(is.na(prevalent_mi_date), NA, as.numeric(
      difftime(prevalent_cad_date, birthdate, units = "days")
    ) / 365.25)
  )


final_dat <- final_dat %>%
  mutate(
    # Calculate censor flag
    CAD_censor = ifelse(incident_mi == 1 | prevalent_mi == 1, 1, 0),
    # Calculate age at censoring or follow-up
    CAD_censor_age = case_when(
      # Use age at prevalent CAD if a prevalent event exists
      !is.na(age_at_prevalent_cad) & prevalent_mi == 1 ~ age_at_prevalent_cad,
      # Use age at incident CAD if there is no prevalent event
      !is.na(age_at_incident_cad) & incident_mi == 1 ~ age_at_incident_cad,
      # Otherwise, age at the follow-up end date
      TRUE ~ as.numeric(difftime(as.Date("2023-06-15"), birthdate, units = "days")) / 365.25
    )
  )


### Note that only ~26,000 participants had an LDL-C measurement.
### Adjust LDL-C for statin use: multiply on-treatment values by 5/4
### (i.e., divide by 0.8) to approximate untreated LDL-C.
final_dat$ldladj = ifelse(final_dat$lipidmed == "yes",
                          final_dat$ldlc * 5 / 4,
                          final_dat$ldlc)

Phenotyping

We previously validated that the MGB phenotype implementation of Hard CAD matches the HARD CAD definition, so we use it here to be consistent with the AoU HARD CAD phenotype.

  • Because we are using age as the time scale (or a GLM with ‘ever event’ as the outcome), follow-up time is really just time from birth to the event or end of follow-up; a minimal sketch contrasting the two parameterizations follows this list.

  • We know that time is inhomogeneous: even if we use an OR instead of an HR, the OR will tend to decrease over time because more non-genetically mediated CAD events accumulate at older ages.
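
A minimal sketch contrasting the two parameterizations (hypothetical: it assumes the totaldata object constructed later in this document, and that enrollment precedes the event/censoring age, so prevalent cases would need special handling):

Code
library(survival)

# Age as the time scale: subjects enter the risk set at enrollment age
# (left truncation) and exit at the event/censoring age.
fit_age <- coxph(Surv(enrollment_age, CAD_censor_age, CAD_censor) ~ pop_scaled_prs,
                 data = totaldata)

# Follow-up time as the time scale, with baseline age as a covariate instead.
fit_fu <- coxph(Surv(CAD_censor_age - enrollment_age, CAD_censor) ~ pop_scaled_prs +
                  enrollment_age, data = totaldata)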

Now we want to calculate ancestry-specific thresholds for the PRS. Let’s get the PRS for African, European, and Asian ancestry that corresponds to a 40 mg/dL increase in LDL cholesterol in the UK Biobank, using the AFR, SAS, and EUR individuals present.

Code
SAS_dat=final_dat[final_dat$knn%in%"SAS",]
AFR_dat=final_dat[final_dat$knn%in%"AFR",]
EUR_dat=final_dat[final_dat$knn%in%"EUR",]
AMR_dat=final_dat[final_dat$knn%in%"AMR",]
EAS_dat=final_dat[final_dat$knn%in%"EAS",]


pop_data_list=list(SAS_dat,AFR_dat,EUR_dat,AMR_dat,EAS_dat)
names(pop_data_list)=c("SAS","AFR","EUR","AMR","EAS")

final_dat=final_dat[!is.na(final_dat$knn),]
final_dat %>%
  group_by(knn) %>%
  summarise(
    "cad_rate" = mean(CAD_censor),
    "mean_ldl" = mean(na.omit(ldlc)),
    "%M" = mean(na.omit(sex_imputed == "male")),
    "meanAge" = as.numeric(mean(enrollment_age)),
    "population_size" = length(knn)
  )
# A tibble: 5 × 6
  knn   cad_rate mean_ldl  `%M` meanAge population_size
  <chr>    <dbl>    <dbl> <dbl>   <dbl>           <int>
1 AFR      0.394     104. 0.370    48.7            2890
2 AMR      0.272     102. 0.343    42.2            3849
3 EAS      0.103     101. 0.372    41.9            1061
4 EUR      0.228     102. 0.458    55.1           44870
5 SAS      0.149     101. 0.514    41.4             605

Histograms

We note that the distributions of these PRS are skewed for non-EUR populations, but we will normalize them for our analysis:

Code
prs_merged = inner_join(ancestry, prs, by = c("IID" = "#IID"))


# Filter for EUR and select relevant columns
prs_EUR <- prs_merged %>%
  filter(knn == "EUR") %>%
  select(IID, LDL.MultiAncestry.hg38.EUR_SUM)

# Filter for AFR and select relevant columns
prs_AFR <- prs_merged %>%
  filter(knn == "AFR") %>%
  select(IID, LDL.MultiAncestry.hg38.AFR_SUM)



# Filter for EAS and select relevant columns
prs_EAS <- prs_merged %>%
  filter(knn == "EAS") %>%
  select(IID, LDL.MultiAncestry.hg38.EAS_SUM)

# Filter for AMR and select relevant columns; the AMR (Admixed American)
# group is scored with the HISP (Hispanic) weights
prs_AMR <- prs_merged %>%
  filter(knn == "AMR") %>%
  select(IID, LDL.MultiAncestry.hg38.HISP_SUM)

prs_SAS <- prs_merged %>%
  filter(knn == "SAS") %>%
  select(IID, LDL.MultiAncestry.hg38.SAS_SUM)


colnames(prs_SAS) = colnames(prs_AFR) = colnames(prs_EUR) = colnames(prs_EAS) =
  colnames(prs_AMR) = c("IID", "PRS")


SAS_dat = merge(prs_SAS, final_dat[final_dat$knn %in% "SAS", ], by = "IID")
AFR_dat = merge(prs_AFR, final_dat[final_dat$knn %in% "AFR", ], by = "IID")
EUR_dat = merge(prs_EUR, final_dat[final_dat$knn %in% "EUR", ], by = "IID")
AMR_dat = merge(prs_AMR, final_dat[final_dat$knn %in% "AMR", ], by = "IID")
EAS_dat = merge(prs_EAS, final_dat[final_dat$knn %in% "EAS", ], by = "IID")


pop_data_list = list(SAS_dat, AFR_dat, EUR_dat, AMR_dat, EAS_dat)
names(pop_data_list) = c("SAS", "AFR", "EUR", "AMR", "EAS")

totaldata = do.call(rbind, pop_data_list)

ggplot(totaldata, aes(x = PRS, y = knn, fill = knn)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
  labs(title = "Density Distribution of Sum Scores",
       x = "Sum Score",
       y = "Population")
Picking joint bandwidth of 0.0298

Using residual PRS

Code
pcmod = lm(PRS ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10, data = totaldata)
totaldata$predictscore = predict(object = pcmod, newdata = totaldata)
totaldata$residscore = totaldata$PRS - totaldata$predictscore
totaldata$Population = as.factor(totaldata$knn)
totaldata$scaled_resid = scale(totaldata$residscore)


ggplot(totaldata, aes(x = residscore, y = Population, fill = Population)) +
  geom_density_ridges(alpha = 0.7) +
  scale_fill_brewer(palette = "Set1") +
  theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
  labs(title = "Density Distribution of Residual Scores",
       x = "Residual Score",
       y = "Population")
Picking joint bandwidth of 0.0297

Code
# Scale the residual score within each population
totaldata <- totaldata %>%
  group_by(Population) %>%
  mutate(pop_scaled_prs = as.numeric(scale(residscore))) %>%
  ungroup()

summary(totaldata$pop_scaled_prs)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-5.22464 -0.58991  0.09983  0.00000  0.68542  3.85568 
Code
ggplot(totaldata, aes(x = pop_scaled_prs, y = Population, fill = Population)) +
    geom_density_ridges(alpha = 0.7) +
    scale_fill_brewer(palette = "Set1") +
    theme_ridges() + # Optional: Adds a nice theme suitable for ridgeline plots
    labs(title = "Density Distribution of Residual Scores: Scaled Within Population",
         x = "Scaled Residual Score",
         y = "Population")
Picking joint bandwidth of 0.16

Code
library(dplyr)
totaldata %>%
  group_by(Population) %>%
  summarize(
    quantile(pop_scaled_prs, 0.25),
    median(pop_scaled_prs),
    mean(pop_scaled_prs),
    quantile(pop_scaled_prs, 0.75)
  )
# A tibble: 5 × 5
  Population `quantile(pop_scaled_prs, 0.25)` `median(pop_scaled_prs)`
  <fct>                                 <dbl>                    <dbl>
1 AFR                                  -0.443                   0.239 
2 AMR                                  -0.608                   0.0800
3 EAS                                  -0.231                   0.276 
4 EUR                                  -0.599                   0.0840
5 SAS                                  -0.566                   0.0690
# ℹ 2 more variables: `mean(pop_scaled_prs)` <dbl>,
#   `quantile(pop_scaled_prs, 0.75)` <dbl>

Let’s look at the departure from proportional hazards (PH):

Code
library(survminer)
Loading required package: ggpubr
Code
library(survival)

Attaching package: 'survival'
The following object is masked from 'package:survminer':

    myeloma
Code
fit3 <- survfit(Surv(time = CAD_censor_age, CAD_censor) ~ Population, data = totaldata)
ggsurvplot(fit3, data = totaldata, conf.int = FALSE, fun = "event",
           risk.table = TRUE, risk.table.col = "strata",
           ggtheme = theme_classic(), xlab = "Age", palette = "nejm", xlim = c(0, 90))
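
A formal check of the PH assumption can use scaled Schoenfeld residuals via survival::cox.zph (a sketch, not run above; it pools totaldata with population strata, and a small p-value for the PRS term indicates departure from PH):

Code
ph_fit <- coxph(Surv(CAD_censor_age, CAD_censor) ~ pop_scaled_prs + strata(Population),
                data = totaldata)
cox.zph(ph_fit)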

Code
totaldata %>%
  group_by(Population) %>%
  summarize(
    "event_age" = mean(CAD_censor_age[CAD_censor == 1]),
    "censor_age" = mean(CAD_censor_age[CAD_censor == 0]),
    "event_rate" = mean(CAD_censor == 1),
    "oldest_event" = max(CAD_censor_age[CAD_censor == 1]),
    "oldest_control" = max(CAD_censor_age[CAD_censor == 0])
  )
# A tibble: 5 × 6
  Population event_age censor_age event_rate oldest_event oldest_control
  <fct>          <dbl>      <dbl>      <dbl>        <dbl>          <dbl>
1 AFR             51.6       53.2      0.394         95.8           95.8
2 AMR             48.5       46.8      0.272         95.6           95.6
3 EAS             53.1       46.7      0.103         95.9           95.9
4 EUR             60.8       59.7      0.228         99.5           99.5
5 SAS             53.4       45.7      0.149         90.4           90.4

Table1

Code
totaldata$followup = round(totaldata$CAD_censor_age - totaldata$enrollment_age, 2)
library(table1)

Attaching package: 'table1'
The following objects are masked from 'package:base':

    units, units<-
Code
dat <- totaldata
dat$CAD_censor <- as.factor(dat$CAD_censor)
label(dat$sex_imputed) <- "Sex"
label(dat$ldladj) <- "LDL-C (adjusted for Statin)"
label(dat$CAD_censor) <- "CAD Ever"
label(dat$scaled_resid) <- "Overall Scaled Residual PRS"
label(dat$pop_scaled_prs) <- "Pop Scaled Residual PRS"
label(dat$enrollment_age) = "Age Enrolled"

# Continue with creating the table
table1(~sex_imputed + ldladj + CAD_censor + enrollment_age + pop_scaled_prs + scaled_resid | knn, data = dat)
|                                              | AFR (N=2890)        | AMR (N=3849)         | EAS (N=1061)        | EUR (N=44870)        | SAS (N=605)          | Overall (N=53275)    |
|----------------------------------------------|---------------------|----------------------|---------------------|----------------------|----------------------|----------------------|
| Sex: female                                  | 1821 (63.0%)        | 2528 (65.7%)         | 666 (62.8%)         | 24340 (54.2%)        | 294 (48.6%)          | 29649 (55.7%)        |
| Sex: male                                    | 1069 (37.0%)        | 1321 (34.3%)         | 395 (37.2%)         | 20530 (45.8%)        | 311 (51.4%)          | 23626 (44.3%)        |
| LDL-C (adjusted for Statin): Mean (SD)       | 113 (48.1)          | 109 (44.4)           | 105 (36.4)          | 110 (37.9)           | 105 (36.1)           | 110 (39.1)           |
| LDL-C (adjusted for Statin): Median [Min, Max] | 106 [11.0, 828]   | 104 [15.0, 1090]     | 101 [26.3, 246]     | 106 [12.5, 568]      | 99.0 [25.0, 235]     | 106 [11.0, 1090]     |
| LDL-C (adjusted for Statin): Missing         | 1223 (42.3%)        | 1974 (51.3%)         | 598 (56.4%)         | 22676 (50.5%)        | 322 (53.2%)          | 26793 (50.3%)        |
| CAD Ever: 0                                  | 1750 (60.6%)        | 2801 (72.8%)         | 952 (89.7%)         | 34652 (77.2%)        | 515 (85.1%)          | 40670 (76.3%)        |
| CAD Ever: 1                                  | 1140 (39.4%)        | 1048 (27.2%)         | 109 (10.3%)         | 10218 (22.8%)        | 90 (14.9%)           | 12605 (23.7%)        |
| Age Enrolled: Mean (SD)                      | 48.7 (16.3)         | 42.2 (15.6)          | 41.9 (16.0)         | 55.1 (16.9)          | 41.4 (15.9)          | 53.4 (17.2)          |
| Age Enrolled: Median [Min, Max]              | 49.4 [17.8, 98.4]   | 40.1 [1.65, 95.5]    | 38.6 [17.9, 89.1]   | 57.7 [1.98, 102]     | 37.3 [3.66, 85.9]    | 55.8 [1.65, 102]     |
| Pop Scaled Residual PRS: Mean (SD)           | -0.0000000692 (1.00) | -0.0000000779 (1.00) | -0.00000189 (1.00) | 0.0000000713 (1.00)  | 0.00000116 (1.00)    | 0.0000000263 (1.00)  |
| Pop Scaled Residual PRS: Median [Min, Max]   | 0.239 [-4.82, 2.18] | 0.0800 [-4.69, 2.75] | 0.276 [-5.22, 1.90] | 0.0841 [-5.08, 3.86] | 0.0690 [-4.09, 2.77] | 0.0998 [-5.22, 3.86] |
| Overall Scaled Residual PRS: Mean (SD)       | -0.0797 (1.25)      | 0.217 (0.992)        | -0.0303 (1.01)      | -0.0114 (0.982)      | -0.0991 (0.779)      | -0.0000000000000000371 (1.00) |
| Overall Scaled Residual PRS: Median [Min, Max] | 0.219 [-6.12, 2.65] | 0.296 [-4.44, 2.95] | 0.250 [-5.33, 1.89] | 0.0711 [-5.00, 3.78] | -0.0453 [-3.28, 2.06] | 0.0986 [-6.12, 3.78] |

Impact of PRS

To assess the impact of LDL-C PRS on CAD risk, you need to establish a two-step relationship: first, between LDL-C PRS and LDL-C levels (per 40 mg/dL change), and then between LDL-C PRS and CAD outcomes. This approach allows you to understand how changes in LDL-C levels attributed to genetic risk (as quantified by the PRS) relate to CAD risk. Here’s how you can approach this analysis for both European (EUR) and African (AFR) populations:

Step 1: Associate LDL-C PRS with LDL-C Levels (per 40 mg/dL)

  1. Linear Regression for LDL-C Levels: Fit a linear regression model with LDL-C levels as the outcome and LDL-C PRS as the predictor. You might want to adjust for relevant covariates.

  2. Rescale the Coefficient for LDL-C PRS: Adjust the coefficient from this model to reflect a 40 mg/dL change in LDL-C, as sketched below.
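
The rescaling itself is a single division (a sketch; ldl_model and pop_data refer to the per-population objects created in the loop further below):

Code
# If one unit of PRS raises LDL-C by beta mg/dL, dividing the PRS by
# (40 / beta) yields a score whose unit corresponds to 40 mg/dL of LDL-C.
beta <- coef(ldl_model)["PRS"]              # mg/dL per unit of PRS
pop_data$scaled_ldl_prs <- pop_data$PRS / (40 / beta)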

Step 2: Associate Rescaled LDL-C PRS with CAD

  1. Incorporate Rescaled LDL-C PRS into Cox Model: Use the rescaled LDL-C PRS from Step 1 as a predictor in a Cox proportional hazards model to analyze its relationship with CAD risk.

  2. Perform Analysis for Both EUR and AFR Populations: Repeat these steps separately for EUR and AFR populations to capture population-specific effects.

Here’s an example in R; the loop below repeats the same steps for each population:

Code
library(dplyr)
library(survival)
# 'totaldata' (built above) is the analysis dataset

# List of unique populations
populations <- c("AFR", "SAS", "EUR", "AMR", "EAS")

# Initialize an empty list for storing models and results
cox_models <- list()
glm_models = list()


results <- data.frame(
  Population = character(),
  PRS_Mean = numeric(),
  PRS_SD = numeric(),
  Percent_male = numeric(),
  LDL_Coef_CI_Lower = numeric(),
  LDL_Coef_CI_Upper = numeric(),
  Population_Scale_Factor = numeric(),
  LDL_Scaled_Coef_CI_Lower = numeric(),
  LDL_Scaled_Coef_CI_Upper = numeric(),
  HR = numeric(),
  HR_CI_Lower = numeric(),
  HR_CI_Upper = numeric(),
  OR = numeric(),
  OR_LCI = numeric(),
  OR_UCI = numeric(),
  stringsAsFactors = FALSE
)


for (pop in populations) {
  # Subset data for the specific population
  pop_data = totaldata[totaldata$knn %in% pop,]
  
  # use the residual score scaled within each population
  #pop_data$PRS = scale(pop_data$residscore)
  pop_data$PRS = pop_data$pop_scaled_prs
  ##print("PRS Summary")
  #print(summary(pop_data$PRS))
  
  
  ldl_model <-
    lm(
      ldladj ~ PRS + as.numeric(enrollment_age) + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldladj),]
    )
  
  ldl_coef_ci <- confint(ldl_model, "PRS")
  #print(paste("LDL_coef_CI for population:", pop))
  #print(confint(ldl_model, parm = "PRS"))
  
  
  
  # Calculate the PRS scaling factor
  prs_scale_factor <- 40 / coef(ldl_model)["PRS"]
  #print(paste0(pop,":",prs_scale_factor))
  #prs_scale_factor=1
  # Scale the LDL-C PRS so that one unit of the scaled score corresponds to a 40 mg/dL increase in LDL-C
  pop_data$scaled_ldl_prs <- pop_data$PRS / prs_scale_factor
  
  
  ldl_model_2 <-
    lm(
      ldladj ~ scaled_ldl_prs + as.numeric(enrollment_age) + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data[!is.na(pop_data$ldladj),]
    )
  
  ldl_scaled_coef_ci <- confint(ldl_model_2, "scaled_ldl_prs")
  
  
  #print(summary(pop_data$scaled_ldl_prs))
  # Step 2: Cox Model for CAD
  ## age is the time scale, so enrollment age is not included as a covariate
  cox_model <-
    coxph(
      Surv(CAD_censor_age, CAD_censor) ~ scaled_ldl_prs
      + as.factor(sex_imputed) + PC1 + PC2 + PC3 + PC4 +
        PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
      data = pop_data
    )
  #cox_model <- glm(Cad_Sta ~ scaled_ldl_prs+sex, data = pop_data,family = "binomial")
  cox_models[[pop]] <- cox_model
  
  
  
  glm_model <- glm(
    CAD_censor ~ scaled_ldl_prs + enrollment_age+as.factor(sex_imputed) + PC1 +
      PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10,
    data = pop_data,
    family = "binomial"
  )
  glm_models[[pop]] <- glm_model
  log_or = summary(glm_model)$coefficients["scaled_ldl_prs", "Estimate"]
  log_or_se = summary(glm_model)$coefficients["scaled_ldl_prs", "Std. Error"]
  or = round(exp(log_or), 2)
  lci = round(exp(log_or - 1.96 * log_or_se), 2)
  uci = round(exp(log_or + 1.96 * log_or_se), 2)
  
  glmci = confint(glm_model)
  
  # Extract summary stats
  cox_summary <- summary(cox_model)
  glm_summary <- summary(glm_model)
  
  # Compile results into the data frame
  # Append to results
  results <- rbind(
    results,
    data.frame(
      Population = pop,
      PRS_Mean = mean(pop_data$PRS),
      PRS_SD = sd(pop_data$PRS),
      Percent_male = mean(pop_data$sex_imputed == "male"),
      
      
      LDL_Coef_CI_Lower = ldl_coef_ci[1],
      LDL_Coef_CI_Upper = ldl_coef_ci[2],
      Population_Scale_Factor = prs_scale_factor,
      LDL_Scaled_Coef_CI_Lower = ldl_scaled_coef_ci[1],
      LDL_Scaled_Coef_CI_Upper = ldl_scaled_coef_ci[2],
      HR = cox_summary$conf.int["scaled_ldl_prs", 1],
      HR_CI_Lower = cox_summary$conf.int["scaled_ldl_prs", 3],
      HR_CI_Upper = cox_summary$conf.int["scaled_ldl_prs", 4],
      OR = or,
      OR_LCI = exp(glmci['scaled_ldl_prs', 1]),
      OR_UCI = exp(glmci['scaled_ldl_prs', 2])
    )
  )
}
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Waiting for profiling to be done...
Code
results
     Population      PRS_Mean PRS_SD Percent_male LDL_Coef_CI_Lower
PRS         AFR  1.534528e-16      1    0.3698962          8.860904
PRS1        SAS  5.615343e-17      1    0.5140496          2.482535
PRS2        EUR -3.976835e-17      1    0.4575440          8.644184
PRS3        AMR  8.405274e-17      1    0.3432060          9.107990
PRS4        EAS -6.320214e-17      1    0.3722903          4.543204
     LDL_Coef_CI_Upper Population_Scale_Factor LDL_Scaled_Coef_CI_Lower
PRS          13.323792                3.606089                 31.95321
PRS1         11.520983                5.712850                 14.18235
PRS2          9.607221                4.383224                 37.88940
PRS3         13.217593                3.583333                 32.63696
PRS4         10.933992                5.168895                 23.48334
     LDL_Scaled_Coef_CI_Upper       HR HR_CI_Lower HR_CI_Upper    OR    OR_LCI
PRS                  48.04679 1.026835   0.8315990    1.267907  0.99 0.7516421
PRS1                 65.81765 5.100317   1.3806525   18.841261 16.92 3.6238440
PRS2                 42.11060 1.171711   1.0749651    1.277164  1.15 1.0409548
PRS3                 47.36304 1.301657   1.0284707    1.647408  1.30 0.9814195
PRS4                 56.51666 1.835839   0.6447584    5.227234  1.87 0.6206754
        OR_UCI
PRS   1.304601
PRS1 85.004439
PRS2  1.273449
PRS3  1.725766
PRS4  6.188360
Code
saveRDS(results, "results_mgb.rds")

# Access and summarize the Cox models for each population
lapply(cox_models, function(x) {
  exp(coef(x)["scaled_ldl_prs"])
})
$AFR
scaled_ldl_prs 
      1.026835 

$SAS
scaled_ldl_prs 
      5.100317 

$EUR
scaled_ldl_prs 
      1.171711 

$AMR
scaled_ldl_prs 
      1.301657 

$EAS
scaled_ldl_prs 
      1.835839 

Now let’s collect the population-specific estimates for a forest plot (the meta-analysis follows below):

Code
# Create a data frame to store the results
results <- data.frame(
  Population = character(),
  HR = numeric(),
  Lower = numeric(),
  Upper = numeric(),
  stringsAsFactors = FALSE
)

#Extract the results from each model
for (pop in names(cox_models)) {
  model_summary <- summary(cox_models[[pop]])
  hr <- model_summary$coefficients["scaled_ldl_prs", "exp(coef)"]
  lower <- model_summary$conf.int["scaled_ldl_prs", "lower .95"]
  upper <- model_summary$conf.int["scaled_ldl_prs", "upper .95"]
  
  # Add to the results data frame
  results <-
    rbind(results,
          data.frame(
            Population = pop,
            HR = hr,
            Lower = lower,
            Upper = upper
          ))
}

ggplot(results,
       aes(
         x = Population,
         y = HR,
         ymin = Lower,
         ymax = Upper,
         color = Population
       )) +
  geom_point() + # Add points for hazard ratios
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) + # Add error bars
  coord_flip() + # Flip coordinates for a horizontal plot
  xlab("Population") +
  ylab("Hazard Ratio (HR)") +
  ggtitle("HR for LDL-C_40 on CAD across Populations:MGB") +
  theme_minimal()

And now with OR/GLM:

Code
lapply(glm_models, function(x) {
  exp(coef(x)["scaled_ldl_prs"])
})
$AFR
scaled_ldl_prs 
     0.9896046 

$SAS
scaled_ldl_prs 
       16.9214 

$EUR
scaled_ldl_prs 
      1.151231 

$AMR
scaled_ldl_prs 
      1.299968 

$EAS
scaled_ldl_prs 
      1.871408 
Code
#
results <- data.frame(
  Population = character(),
  OR = numeric(),
  Lower = numeric(),
  Upper = numeric(),
  stringsAsFactors = FALSE
)

for (pop in names(cox_models)) {
  model_summary <-
    tidy(glm_models[[pop]], conf.int = TRUE, exponentiate = TRUE)
  
  if ("scaled_ldl_prs" %in% model_summary$term) {
    term_row <- model_summary %>% filter(term == "scaled_ldl_prs")
    
    results <- rbind(
      results,
      data.frame(
        Population = pop,
        OR = term_row$estimate,
        Lower = term_row$conf.low,
        Upper = term_row$conf.high
      )
    )
  } else {
    results <-
      rbind(results,
            data.frame(
              Population = pop,
              OR = NA,
              Lower = NA,
              Upper = NA
            ))
  }
}
# Create the forest plot using ggplot2
ggplot(results,
       aes(
         x = Population,
         y = OR,
         ymin = Lower,
         ymax = Upper,
         color = Population
       )) +
  geom_point() +
  geom_errorbar(aes(ymin = Lower, ymax = Upper), width = 0.1) +
  coord_flip() +
  xlab("Population") +
  ylab("Odds Ratio (OR)") +
  ggtitle("OR for LDL-C PRS_40 on CAD:MGB")  + theme_minimal()

Bayesian Meta-analysis

Now we want to take a two-step Bayesian approach: first calculate a global mean hazard ratio (HR) or odds ratio (OR) from the population-specific estimates, and then use this global mean as a prior in a Bayesian adjustment of the population-specific estimates. In such a case, weighting the global mean by the standard error (or variance) of each population-specific estimate is a good approach, as it accounts for the different levels of uncertainty in each population estimate.

  1. Calculate Weighted Global Mean HR/OR:

    • For each population, you have an HR/OR and its standard error (SE).

    • You can weight the HR/OR by the inverse of the variance (which is SE²).

    • The weighted mean HR/OR is then the sum of each weighted HR/OR divided by the sum of the weights.

  2. Use Weighted Mean as Prior for Bayesian Adjustment:

    • Use the weighted global mean HR/OR as the mean for your prior distribution in the Bayesian model for each population.

    • The variance of the prior could be based on the pooled standard errors or a measure of the variability of the HR/OR across populations.
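
For step 2, each population's log-OR is combined with the prior by precision weighting, the standard normal-normal conjugate update implemented in the code below:

\[\text{Posterior Variance} = \left( \frac{1}{\text{Prior Variance}} + \frac{1}{\text{Variance}_i} \right)^{-1}, \qquad \text{Posterior Mean} = \text{Posterior Variance} \times \left( \frac{\text{Prior Mean}}{\text{Prior Variance}} + \frac{x_i}{\text{Variance}_i} \right)\]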

Variance and Weights

The variance of each log-transformed odds ratio (OR) can be approximated from the confidence interval:

\[\text{Variance} \approx \left( \frac{\log(\text{Upper CI}) - \log(\text{Lower CI})}{2 \times 1.96} \right)^2 \]

Based on this variance, the weight for each observation is:

\[ w_i = \frac{1}{\text{Variance}_i} \]

Weighted Mean

The weighted mean is then calculated as:

\[ \text{Weighted Mean} = \frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i} \]

where \( x_i \) is the log(OR) for each observation \( i \).

Weighted Variance

The weighted variance is given by:

\[\text{Weighted Variance} = \frac{\sum_{i=1}^{n} w_i}{\left( \sum_{i=1}^{n} w_i \right)^2 - \sum_{i=1}^{n} w_i^2} \sum_{i=1}^{n} w_i (x_i - \text{Weighted Mean})^2\]

Here, \( \sum_{i=1}^{n} w_i \) is the sum of the weights, \( \sum_{i=1}^{n} w_i^2 \) is the sum of the squared weights, and \( x_i \) is each individual log-transformed OR.

Code
# Assuming you have a dataframe 'results' with columns 'Population', 'OR', and 'SE'

# Calculate variance and weights
results <- results %>%
  mutate(
    log_OR = log(OR),
    log_Lower = log(Lower),
    log_Upper = log(Upper),
    Variance = ((log_Upper - log_Lower) / (2 * 1.96))^2,
    Weight = 1 / Variance
  )


# Calculate weighted mean and variance for the prior
sum_weights <- sum(results$Weight)
sum_squared_weights <- sum(results$Weight^2)

weighted_mean_log_OR <- sum(results$log_OR * results$Weight) / sum_weights
weighted_variance <- (sum_weights / (sum_weights^2 - sum_squared_weights)) * 
                     sum(results$Weight * (results$log_OR - weighted_mean_log_OR)^2)

prior_mean <- weighted_mean_log_OR
prior_variance <- weighted_variance

# Perform Bayesian updating for each population
results <- results %>%
  mutate(
    Posterior_Variance = 1 / (1 / prior_variance + 1 / Variance),
    Posterior_Mean = Posterior_Variance * (prior_mean / prior_variance + log_OR / Variance)
  )

# Exponentiating the posterior mean to get back to the OR scale
results$Posterior_OR <- exp(results$Posterior_Mean)

results <- results %>%
  mutate(
    Posterior_Lower_Log = Posterior_Mean - 1.96 * sqrt(Posterior_Variance),
    Posterior_Upper_Log = Posterior_Mean + 1.96 * sqrt(Posterior_Variance),
    Posterior_Lower_OR = exp(Posterior_Lower_Log),
    Posterior_Upper_OR = exp(Posterior_Upper_Log)
  )
# Now 'results' contains the posterior mean and variance for each population
print(results)
  Population         OR     Lower     Upper      log_OR   log_Lower log_Upper
1        AFR  0.9896046 0.7516421  1.304601 -0.01044983 -0.28549503 0.2658973
2        SAS 16.9214014 3.6238440 85.004439  2.82857917  1.28753534 4.4427035
3        EUR  1.1512314 1.0409548  1.273449  0.14083217  0.04013836 0.2417288
4        AMR  1.2999682 0.9814195  1.725766  0.26233978 -0.01875530 0.5456708
5        EAS  1.8714081 0.6206754  6.188360  0.62669116 -0.47694709 1.8226701
     Variance     Weight Posterior_Variance Posterior_Mean Posterior_OR
1 0.019785604  50.541798        0.015803786     0.02157385     1.021808
2 0.647847638   1.543573        0.070039034     0.43840074     1.550226
3 0.002644647 378.122284        0.002558484     0.14108771     1.151526
4 0.020732038  48.234525        0.016401857     0.23859936     1.269470
5 0.344143016   2.905769        0.063938830     0.23748665     1.268058
  Posterior_Lower_Log Posterior_Upper_Log Posterior_Lower_OR Posterior_Upper_OR
1         -0.22482385           0.2679715          0.7986569           1.307310
2         -0.08031108           0.9571126          0.9228292           2.604166
3          0.04194805           0.2402274          1.0428403           1.271538
4         -0.01241732           0.4896160          0.9876595           1.631690
5         -0.25812147           0.7330948          0.7725014           2.081512

Forest Plot

We can then plot the Bayesian meta-analysis results in a forest plot:

Code
library(dplyr)
library(ggplot2)
library(forestplot)
Loading required package: grid
Loading required package: checkmate
Loading required package: abind
Code

# Assuming 'results' is already loaded with your data

# We need to create a new data frame for ggplot
# First, we create two separate data frames for original and posterior results and then bind them together
# plot_data <- results %>%
#   select(Population, OR, Lower, Upper, Posterior_OR, Posterior_Lower_OR, Posterior_Upper_OR) %>%
#   gather(key = "Type", value = "Value", -Population, -Lower, -Upper, -Posterior_Lower_OR, -Posterior_Upper_OR) %>%
#   mutate(
#     ymin = ifelse(Type == "OR", Lower, Posterior_Lower_OR),
#     ymax = ifelse(Type == "OR", Upper, Posterior_Upper_OR),
#     Type = factor(Type, levels = c("OR", "Posterior_OR"))
#   )
# 
# # Now we plot with ggplot2
# ggplot(plot_data, aes(y = Population, x = Value, xmin = ymin, xmax = ymax, color = Type)) +
#   geom_errorbarh(height = 0.2) +
#   geom_point(size = 2.5) +
#   scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
#   theme_minimal() + lims(x=c(0.5,2.5))+
#   labs(x = "Odds Ratio", y = "", color = "Estimate Type") +
#   theme(legend.position = "bottom")



# Adding an identifier for each row
results$ID <- seq_along(results$Population)


# Create a long-format data frame for plotting
plot_data <- tidyr::pivot_longer(
  results, 
  cols = c("OR", "Posterior_OR"), 
  names_to = "Estimate_Type", 
  values_to = "Value"
) %>%
  mutate(
    ymin = ifelse(Estimate_Type == "OR", Lower, Posterior_Lower_OR),
    ymax = ifelse(Estimate_Type == "OR", Upper, Posterior_Upper_OR),
    Estimate_Type = factor(Estimate_Type, levels = c("OR", "Posterior_OR")),
    # Adjust the y position of the posterior estimates
    ypos = as.numeric(ID) + ifelse(Estimate_Type == "OR", -0.1, 0.1)
  )

# Create the plot
ggplot(plot_data, aes(x = Value, y = ypos, color = Estimate_Type)) +
  geom_point(size = 3) +
  geom_errorbar(aes(xmin = ymin, xmax = ymax), width = 0.2) +
  scale_color_manual(values = c("OR" = "black", "Posterior_OR" = "red")) +
  labs(x = "Odds Ratio", y = "") +
  theme_minimal() +
  theme(axis.text.y = element_text(vjust = 0.5), 
        axis.ticks.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        legend.position = "bottom") +
  scale_y_continuous(breaks = 1:length(unique(plot_data$ID)), labels = results$Population) 

We can do this in Stan (via brms) and calculate heterogeneity as well:

Code
library(brms)

results <- results %>%
  mutate(se_log_OR = 1 / sqrt(Weight))

# Prior based on an earlier study's reported effect
prior_log_or <- log(1 / 0.76)  # log OR implied by the reported hazard ratio
prior_log_or_se <- (log(1 / 0.64) - log(1 / 0.91)) / (2 * 1.96)  # standard error of the log OR


# or try this:
#   
# rosuvastatin_with_event <- 235
# rosuvastatin_without_event <- 6361 - rosuvastatin_with_event
# 
# placebo_with_event <- 304
# placebo_without_event <- 6344 - placebo_with_event
# 
# # Compute the OR
# odds_case <- rosuvastatin_with_event / rosuvastatin_without_event
# odds_control <- placebo_with_event / placebo_without_event
# OR <- odds_control / odds_case
# 
# # Convert OR to log scale for the prior
# prior_log_or <- log(OR)
# 
# # Calculate standard error of the log OR
# prior_log_or_se <- sqrt(1 / placebo_with_event +
#                         1 / placebo_without_event +
#                         1 / rosuvastatin_with_event +
#                         1 / rosuvastatin_without_event)
# 
# # Print the prior log OR and its standard error
# print(prior_log_or)
# print(prior_log_or_se)
# 
# # Print the OR
# print(OR)
# # Define the priors based on the prior study for the global effect size
#prior_effect <- set_prior(paste("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")

# Option A: prior centered on the inverse-variance weighted global mean
prior_effect <- set_prior(paste("normal(", weighted_mean_log_OR, ", ", sqrt(weighted_variance), ")"), class = "Intercept")

# Option B (used below): prior centered on the earlier study's estimate
prior_global <- set_prior(paste0("normal(", prior_log_or, ", ", prior_log_or_se, ")"), class = "Intercept")

# Weakly informative prior for the between-population heterogeneity
prior_heterogeneity <- set_prior("student_t(3, 0, 10)", class = "sd", group = "Population")

# Fit the model
meta_analysis_model <- brm(
  bf(log_OR | se(se_log_OR) ~ 1 + (1 | Population)),
  data = results,
  prior = c(prior_global, prior_heterogeneity),
  family = gaussian(),
  chains = 4,
  iter = 4000,
  warmup = 2000,
  control = list(adapt_delta = 0.999999) # adjust for convergence
)

# Check the summary of the model
summary(meta_analysis_model)

# Extract population-specific effects
ranef(meta_analysis_model)

global_effect= fixef(meta_analysis_model)[1,1]

# Get the random effects (population-specific deviations)
population_effects <- ranef(meta_analysis_model)

# Calculate the population-specific log ORs by adding the global effect to the random effects
population_specific_log_ORs <- global_effect + population_effects$Population[,,"Intercept"][,"Estimate"]

# Convert the log ORs to ORs
population_specific_ORs <- exp(population_specific_log_ORs)

# Now you have the population-specific ORs
print(population_specific_ORs)

Heterogeneity assessment:

Code
# Extract the standard deviation of the random effects
heterogeneity_estimate <- VarCorr(meta_analysis_model)$Population$sd

# Print the heterogeneity estimate
print(heterogeneity_estimate)

# Conduct posterior predictive checks
pp_check(meta_analysis_model)

# Plot the posterior distribution of the heterogeneity SD
posterior_samples <- posterior_samples(meta_analysis_model, pars = "sd_Population__Intercept")
hist(posterior_samples[, "sd_Population__Intercept"], breaks = 40,
     main = "Distribution of Heterogeneity (sd Population)",
     xlab = "Heterogeneity (SD)", col = "blue")

# Summarize the posterior distribution of the heterogeneity parameter
posterior_summary <- summary(posterior_samples[, "sd_Population__Intercept"])
print(posterior_summary)