Step 5 Conventional p-value procedure

Independent variable: this SNP’s minor allele count for everyone

Dependent variable: disease status for everyone

5.1 Conduct logistics regression on each SNP

# save odds ratio&p-values for later use
# Note: or is abbrev. for odds_ratio
p = c()
or = c()
for (col in (1:ncol(X))){
  
  #short for SNP minor allele count
  SNPmac = as.vector(X[,col])
  
  #convert status into a vector
  statusv = as.vector(status)
  
  #fit a logistic regression model
  model.glm <- glm(statusv ~ SNPmac, family = 'binomial')
  
  #beta0 is the intercept of the logistic regression
  beta0 = summary(model.glm)$coefficients[1,1]
  
  #beta1 is the coeff of increment of 1 minor allele's effect on the log(p/1-p) = log(odds_ratio)
  beta1 = summary(model.glm)$coefficients[2,1]
  
  or = cbind(or,exp(beta1))
  p = cbind(p, summary(model.glm)$coefficients[2,4])
  
}
or = as.vector(or)
p = as.vector(p)

# Remember how many non-monomorphic SNPs are we using right now? Make sure we have an odds ratio and a p-value for each of them 
# (i.e. The number of odds ratio and p-value should match the number of non-monomorphic SNPs)

length(or)

## [1] 3582

length(p)

## [1] 3582

# Great! Let's first check the estimated effect of our 4 causal SNPs
## Recall that the we set the 10th, 11th, 2000th, 3000th SNPs as causal SNPs
## Recall that we set their odds_ratio (effect size) to be log(1.5)
## Let's see if the logistics regressions can detect that
or[causal_idx]

## [1] 303531777  84118113  67853034  92488570

# OMG! Really big odds ratios. Let's check other SNPs' odds ratios. Are they much smaller than the odds ratios of causal SNPs?
or[1000]

## [1] 3.778327

or[1500]

## [1] 1.652893

or[3500]

## [1] 0.9224015

# exclude monomorphic SNPs
useful_snps = hapmap$map[-mono,]

# build a dataframe with snp.name, chromosome, position, p-values, then remove the causal SNPs by their row index
df = data.frame(snp.name = useful_snps$snp.name, chromosome = useful_snps$chromosome, position = useful_snps$position, pvalue = p, odds_ratio = or, z_value = z) %>% filter(!row_number() %in% causal_idx) 

# preview the dataframe
head(df)

5.2 Get the most important 10 SNPs by ranking their p-values

df[order(df[,"pvalue"])[1:10],]