Setting up
# Here is a function to calculate z values based on p-values and odds_ratio
calculate_zvalue = function(p_value,odds_ratio){
# Creat a new vector to store all Z-scores of the two-tailed z-test
ZSCORE = c()
# Find the z-score for each SNP
for (i in 1:length(p_value)) {
# If Odd Ratio > 1, z-score is the integrated area above the p-value
if(odds_ratio[i]>1){
ZSCORE[i] = qnorm(p_value[i]/2,0,1,lower.tail=FALSE)
}
# Otherwise, z-score is the integrated area below the p-value
else{
ZSCORE[i] = qnorm(p_value[i]/2,0,1,lower.tail=TRUE)
}
}
# Append z-score to our dataset
return(ZSCORE)
}
#Apply the method above to calculate z-values based on p-values and odds ratios
z = calculate_zvalue(p,or)
# Make sure that we have one z-value for each non-monomorphic SNP
length(z)
## [1] 3582
# Let's exclude the 4 causal SNPs here, before that exclude the monomorphic SNPs first
# exclude monomorphic SNPs
useful_snps = hapmap$map[-mono,]
# build a dataframe with snp.name, chromosome, position, p-values, odds_ratios, and z-values, then remove the causal SNPs by their row index
df = data.frame(snp.name = useful_snps$snp.name, chromosome = useful_snps$chromosome, position = useful_snps$position, pvalue = p, odds_ratio = or, z_value = z) %>% filter(!row_number() %in% causal_idx)
# preview the dataframe
# head(df)
# a simple function that checks if a number in list k is within the +/- n range to any number in a given list, return T/F only
nearby = function(k, list, n = 100000){
near_list = c()
for(i in k){ near = F
for (j in list){ if( abs ( i - j ) <= n ){ near = T } }
near_list = c(near_list, near)}
return(near_list)
}
# Here, we categorize the SNPs within 100000 units of position to the causal SNPs as nearby SNPs
# Add one more categorical variable column to our previous dataframe, indicating if the SNP is a "nearby SNP" or not
nearby_status = c()
for( i in (1:2)){
# get lists of physical positions of causal SNPs on the ith chromosome
causal_positions = (causal_map %>% filter(chromosome==i))$position
# get the nearby status of all available SNPs on the ith chromosome, determined by the distance between causal SNPs' positions with theirs
nearby_status = c(nearby_status, nearby((df %>% filter(chromosome==i))$position,causal_positions, n=100000))
}
df$nearby_status = nearby_status
#preview the dataframe again
head(df)
Run the PLIS procedure
#Run the PLIS procedure to find the SNPs that are most associated with the disease status
library(PLIS)
SNP = c()
LIS = c()
for (i in 1:2){
#get the snps on chromosome i
chr_sample=df[which(df[,"chromosome"]==i),]
#make sure these snps are correctly ordered by their physical locations
chr_sample <- chr_sample[order(chr_sample[,"position"]),]
#calculate z values
chr_sample$z = calculate_zvalue(chr_sample$pvalue, chr_sample$odds_ratio)
#apply EM algorithm
chr_sample.L2rlts=em.hmm(chr_sample$z, L = 2)
#append new results
SNP = rbind(SNP, chr_sample)
LIS = c(LIS, chr_sample.L2rlts$LIS)
print(paste("---Finished analyzing chromosome",i, "---"))
}
## [1] "---Finished analyzing chromosome 1 ---"
## [1] "---Finished analyzing chromosome 2 ---"
# plis controls the global FDR at 0.01 here for the pooled hypotheses from different groups
plis.rlts=plis(LIS,fdr=0.001)
# gFDR stands for global FDR, denotes the corresponding global FDR if using the LIS statistic as the significance cutoff
all.Rlts=cbind(SNP, LIS, gFDR=plis.rlts$aLIS, fdr001state=plis.rlts$States)