Computing PRS

Author

Sarah Urbut

Published

March 7, 2024

PRS Computation

This script details how to compute multiethnic scores using the weights that are available in /medpop/esp2/SarahUrbut/ldpred2_score/MultipleAncestry-Weight/LDL.MultiAncectry.

It was created 3/5/2024 by Sarah Urbut with much help from MS Kim!

Extracted Bgen

recall that extracted bgen variants and the summaries were computed first using scripts below in /medpop/esp2/mskim/For_other/Sarah/

Step 1:
- run 1_my_pvar.sh and 2_combine_pvar_chr1to22.sh placed in /medpop/esp2/mskim/For_other/Sarah/

Harmonizing Variants

Now, we will need to generate summary statistics that are harmonaized with available variants for use in effect scoring

Code

library(data.table)

library(dplyr)

poplist = c("AFR", "EUR", "EAS", "SAS", "HISP")

for (i in 1:length(poplist)) {
  pop = poplist[i]
  
  print(pop)
  
  PGSsumstat = fread("/medpop/esp2/SarahUrbut/ldpred2_score/LDL-SAS-GLGC.ldpred2.beta.grid.tsv")
  
  
  
  PGSsumstat = fread(
    paste0(
      "/medpop/esp2/SarahUrbut/ldpred2_score/MultipleAncestry-Weight/LDL.MultiAncectry.",
      pop,
      ".txt.gz"
    )
  )
  
  
  
  #This is how UKB Genotype looks like (pvar).
  
  ukb_genotype <-
    fread(
      "/medpop/esp2/mskim/For_other/Sarah/pvar/combined.pvar.pvar",
      header = T,
      stringsAsFactors = F
    )
  
  ukb_genotype <- ukb_genotype %>% rename(CHROM = `#CHROM`)
  
  PGSsumstat <- PGSsumstat %>% rename(variantID = `sortedID`)
  
  #now we are ready for testing match rate between PGS sum stat and ukb genotype stored in
  
  #UKB pvar>
  
  ukb_genotype$variantID <-
    paste(ukb_genotype$CHROM,
          ukb_genotype$POS,
          ukb_genotype$ALT,
          ukb_genotype$REF,
          sep = ":")
  
  #let's test matching and further try flip flop to maximize matching
  
  # See how many snps are currently matched
  
  PGSsumstat_matched_w_UKBgenotype <-
    PGSsumstat[PGSsumstat$variantID %in% ukb_genotype$variantID,] #530202 out of 1121671 matched
  
  #Step 2: Swap Last Two Alphabets for unmatched Rows
  
  # Function to swap the last two characters in the variantID
  
  swap_last_two <- function(variantID) {
    parts <- unlist(strsplit(variantID, split = ":"))
    
    if (length(parts) >= 4) {
      parts[c(4, 3)] <- parts[c(3, 4)]  # Swap the last two elements
      
      return(paste(parts, collapse = ":"))
      
    } else {
      return(variantID)
      
    }
    
  }
  
  # Apply the function to swap alt/ref allele in snp_name for non-matching rows
  
  PGSsumstat$revised_variantID <-
    ifelse(
      !(PGSsumstat$variantID %in% ukb_genotype$variantID),
      
      sapply(PGSsumstat$variantID, swap_last_two),
      
      PGSsumstat$variantID
    )
  
  #now, compare again the match rate of variants in sum stat and genotpye data
  
  chr1_22_matched <-
    PGSsumstat[PGSsumstat$revised_variantID %in% ukb_genotype$variantID,] # 1061190 mathced
  
  print(nrow(chr1_22_matched))
  
  PGSsumstat$variantID <- NULL
  
  PGSsumstat <- PGSsumstat %>% rename(variantID = `revised_variantID`)
  
  # Performing a left join to add rsID from ukb_genotype to PGSsumstat
  
  PGSsumstat1 <- PGSsumstat %>%
    
    left_join(ukb_genotype[, c("variantID", "ID")], by = "variantID")
  
  PGSsumstat2 <- na.omit(PGSsumstat1, cols = "ID")
  
  PGSsumstat_FINAL <- PGSsumstat2 %>%
    
    select(
      rsid = ID,
      a1 = ALT,
      a0 = REF,
      Beta = 'Weight'
    ) #PLINK will interpret a1 as effect allele automatically otherwise specified. You need to confirm! whether a1 is actually the effect allele (you must ask to group how made this PGSsumstat). If a0 is effect allele, you need to switch a0 and a1.
  
  
  
  
  
  write.table(
    PGSsumstat_FINAL,
    paste0(
      "/medpop/esp2/mskim/For_other/Sarah/",
      pop,
      "FINAL_mixed.tsv"
    ),
    col.names = T,
    row.names = F,
    quote = F,
    sep = "\t"
  )
  
}

Compute PRS

Now that we have our harmonized sum stats, we can go back go to bash and run each of the PRS commands through plink.

you will note that we have generate *_my_PRS_runs.sh for each ancestry using script in /medpop/esp2/SarahUrbut/ldpred2_score/automatingprsscript.R and changing the {{ to { and }} to }
each script is run using qsub /medpop/esp2/mskim/For_other/Sarah/***_my_PRS_run.sh Remember you cannot run from an interactive node

Combining Scores

Now we will need to read these files together to create the score
Instead of calling 1-22 chr individually, we can use loop here over all populations and chromosomes

Code

for (p in 1:length(poplist)) {
  pop = poplist[p]
  
  file_list <- list()
  
  
  
  for (i in 1:22) {
    file_path <-
      
      paste0(
        "/medpop/esp2/mskim/For_other/Sarah/pscore/",
        
        pop,
        
        "_mult.chr",
        
        i,
        
        ".sscore",
        
        sep = ""
        
      )
    
    
    
    # Read the file into file_data
    
    file_data <-
      
      fread(file_path,
            
            header = TRUE,
            
            stringsAsFactors = FALSE)
    
    
    
    # Retain only the 2nd and 5th columns
    
    file_list[[i]] <- file_data[, c(2, 5)]
    
  }
  
  #left_join all chr1-22 scores
  
  result_df <- file_list[[1]]  # start with the first data frame
  
  for (i in 2:length(file_list)) {
    result_df <- left_join(result_df, file_list[[i]], by = "IID")
    
  }
  
  #sum all scores in column 2-23 and remove 22 columns (maintain only sum score)
  
  result_df$sum_score <-
    rowSums(result_df[, 2:23], na.rm = TRUE) #First, calculate the row-wise sum for columns 2 to 23
  
  result_df <- result_df[, c(1, 24)]
  
  write.table(
    result_df,
    paste0(
      file = "/medpop/esp2/mskim/For_other/Sarah/",
      pop,
      "_SUM_SCORE_FINAL"),
      sep = "\t",
      row.names = FALSE,
      quote = FALSE
    )
  
  
}