# Here is a function to calculate z values based on p-values and odds_ratiocalculate_zvalue =function(p_value,odds_ratio){# Creat a new vector to store all Z-scores of the two-tailed z-testZSCORE =c()# Find the z-score for each SNP for (i in1:length(p_value)) {# If Odd Ratio > 1, z-score is the integrated area above the p-valueif(odds_ratio[i]>1){ ZSCORE[i] =qnorm(p_value[i]/2,0,1,lower.tail=FALSE) }# Otherwise, z-score is the integrated area below the p-valueelse{ ZSCORE[i] =qnorm(p_value[i]/2,0,1,lower.tail=TRUE) } }# Append z-score to our datasetreturn(ZSCORE)}
#Apply the method above to calculate z-values based on p-values and odds ratiosz =calculate_zvalue(p,or)# Make sure that we have one z-value for each non-monomorphic SNPlength(z)
## [1] 3582
# Let's exclude the 4 causal SNPs here, before that exclude the monomorphic SNPs first# exclude monomorphic SNPsuseful_snps = hapmap$map[-mono,]# build a dataframe with snp.name, chromosome, position, p-values, odds_ratios, and z-values, then remove the causal SNPs by their row indexdf =data.frame(snp.name = useful_snps$snp.name, chromosome = useful_snps$chromosome, position = useful_snps$position, pvalue = p, odds_ratio = or, z_value = z) %>%filter(!row_number() %in% causal_idx) # preview the dataframe# head(df)
# a simple function that checks if a number in list k is within the +/- n range to any number in a given list, return T/F onlynearby =function(k, list, n =100000){ near_list =c()for(i in k){ near = F for (j in list){ if( abs ( i - j ) <= n ){ near = T } } near_list =c(near_list, near)}return(near_list)}
# Here, we categorize the SNPs within 100000 units of position to the causal SNPs as nearby SNPs# Add one more categorical variable column to our previous dataframe, indicating if the SNP is a "nearby SNP" or notnearby_status =c()for( i in (1:2)){# get lists of physical positions of causal SNPs on the ith chromosome causal_positions = (causal_map %>%filter(chromosome==i))$position# get the nearby status of all available SNPs on the ith chromosome, determined by the distance between causal SNPs' positions with theirs nearby_status =c(nearby_status, nearby((df %>%filter(chromosome==i))$position,causal_positions, n=100000))}df$nearby_status = nearby_status#preview the dataframe againhead(df)
6.2 Run the PLIS procedure
#Run the PLIS procedure to find the SNPs that are most associated with the disease statuslibrary(PLIS)SNP =c()LIS =c()for (i in1:2){#get the snps on chromosome i chr_sample=df[which(df[,"chromosome"]==i),]#make sure these snps are correctly ordered by their physical locations chr_sample <- chr_sample[order(chr_sample[,"position"]),]#calculate z values chr_sample$z =calculate_zvalue(chr_sample$pvalue, chr_sample$odds_ratio)#apply EM algorithm chr_sample.L2rlts=em.hmm(chr_sample$z, L =2)#append new results SNP =rbind(SNP, chr_sample) LIS =c(LIS, chr_sample.L2rlts$LIS)print(paste("---Finished analyzing chromosome",i, "---"))}
# plis controls the global FDR at 0.01 here for the pooled hypotheses from different groupsplis.rlts=plis(LIS,fdr=0.001)# gFDR stands for global FDR, denotes the corresponding global FDR if using the LIS statistic as the significance cutoffall.Rlts=cbind(SNP, LIS, gFDR=plis.rlts$aLIS, fdr001state=plis.rlts$States)
6.3 Get the most important 10 SNPs by ranking LIS
# show the 10 SNPs on both chromosomes with the lowest LIS# i.e. the top 10 SNPs determine by the PLIS procedureall.Rlts[order(all.Rlts[,"LIS"])[1:10],]