Step 6 PLIS procedure

6.1 Setting up

# Here is a function to calculate z values based on p-values and odds_ratio
calculate_zvalue = function(p_value,odds_ratio){
# Creat a new vector to store all Z-scores of the two-tailed z-test
ZSCORE = c()
# Find the z-score for each SNP 
  for (i in 1:length(p_value)) {
    # If Odd Ratio > 1, z-score is the integrated area above the p-value
    if(odds_ratio[i]>1){
      ZSCORE[i] = qnorm(p_value[i]/2,0,1,lower.tail=FALSE)
    }
    # Otherwise, z-score is the integrated area below the p-value
    else{
      ZSCORE[i] = qnorm(p_value[i]/2,0,1,lower.tail=TRUE)
    }
  }
# Append z-score to our dataset
return(ZSCORE)
}
#Apply the method above to calculate z-values based on p-values and odds ratios
z = calculate_zvalue(p,or)

# Make sure that we have one z-value for each non-monomorphic SNP
length(z)
## [1] 3582
# Let's exclude the 4 causal SNPs here, before that exclude the monomorphic SNPs first

# exclude monomorphic SNPs
useful_snps = hapmap$map[-mono,]

# build a dataframe with snp.name, chromosome, position, p-values, odds_ratios, and z-values, then remove the causal SNPs by their row index
df = data.frame(snp.name = useful_snps$snp.name, chromosome = useful_snps$chromosome, position = useful_snps$position, pvalue = p, odds_ratio = or, z_value = z) %>% filter(!row_number() %in% causal_idx) 

# preview the dataframe
# head(df)
# a simple function that checks if a number in list k is within the +/- n range to any number in a given list, return T/F only
nearby = function(k, list, n = 100000){
  near_list = c()
  for(i in k){ near = F 
    for (j in list){ if( abs ( i - j ) <= n ){ near = T } }
    near_list = c(near_list, near)}
  return(near_list)
}
# Here, we categorize the SNPs within 100000 units of position to the causal SNPs as nearby SNPs
# Add one more categorical variable column to our previous dataframe, indicating if the SNP is a "nearby SNP" or not

nearby_status = c()
for( i in (1:2)){
  
  # get lists of physical positions of causal SNPs on the ith chromosome
  causal_positions = (causal_map %>% filter(chromosome==i))$position
  
  # get the nearby status of all available SNPs on the ith chromosome, determined by the distance between causal SNPs' positions with theirs
  nearby_status = c(nearby_status, nearby((df %>% filter(chromosome==i))$position,causal_positions, n=100000))
}

df$nearby_status = nearby_status

#preview the dataframe again
head(df)

6.2 Run the PLIS procedure

#Run the PLIS procedure to find the SNPs that are most associated with the disease status

library(PLIS)
SNP = c()
LIS = c()
for (i in 1:2){
  #get the snps on chromosome i
  chr_sample=df[which(df[,"chromosome"]==i),]

  #make sure these snps are correctly ordered by their physical locations
  chr_sample <- chr_sample[order(chr_sample[,"position"]),]
  
  #calculate z values
  chr_sample$z = calculate_zvalue(chr_sample$pvalue, chr_sample$odds_ratio)
  
  #apply EM algorithm
  chr_sample.L2rlts=em.hmm(chr_sample$z, L = 2)
  
  #append new results
  SNP = rbind(SNP, chr_sample)
  LIS = c(LIS, chr_sample.L2rlts$LIS)
  print(paste("---Finished analyzing chromosome",i, "---"))
}
## [1] "---Finished analyzing chromosome 1 ---"
## [1] "---Finished analyzing chromosome 2 ---"
# plis controls the global FDR at 0.01 here for the pooled hypotheses from different groups
plis.rlts=plis(LIS,fdr=0.001)

# gFDR stands for global FDR, denotes the corresponding global FDR if using the LIS statistic as the significance cutoff
all.Rlts=cbind(SNP, LIS, gFDR=plis.rlts$aLIS, fdr001state=plis.rlts$States)

6.3 Get the most important 10 SNPs by ranking LIS

# show the 10 SNPs on both chromosomes with the lowest LIS
# i.e. the top 10 SNPs determine by the PLIS procedure
all.Rlts[order(all.Rlts[,"LIS"])[1:10],]