Step 5 Conventional p-value procedure
Independent variable: this SNP’s minor allele count for everyone
Dependent variable: disease status for everyone
5.1 Conduct logistics regression on each SNP
# save odds ratio&p-values for later use
# Note: or is abbrev. for odds_ratio
= c()
p = c()
or for (col in (1:ncol(X))){
#short for SNP minor allele count
= as.vector(X[,col])
SNPmac
#convert status into a vector
= as.vector(status)
statusv
#fit a logistic regression model
<- glm(statusv ~ SNPmac, family = 'binomial')
model.glm
#beta0 is the intercept of the logistic regression
= summary(model.glm)$coefficients[1,1]
beta0
#beta1 is the coeff of increment of 1 minor allele's effect on the log(p/1-p) = log(odds_ratio)
= summary(model.glm)$coefficients[2,1]
beta1
= cbind(or,exp(beta1))
or = cbind(p, summary(model.glm)$coefficients[2,4])
p
}= as.vector(or)
or = as.vector(p) p
# Remember how many non-monomorphic SNPs are we using right now? Make sure we have an odds ratio and a p-value for each of them
# (i.e. The number of odds ratio and p-value should match the number of non-monomorphic SNPs)
length(or)
## [1] 3582
length(p)
## [1] 3582
# Great! Let's first check the estimated effect of our 4 causal SNPs
## Recall that the we set the 10th, 11th, 2000th, 3000th SNPs as causal SNPs
## Recall that we set their odds_ratio (effect size) to be log(1.5)
## Let's see if the logistics regressions can detect that
or[causal_idx]
## [1] 303531777 84118113 67853034 92488570
# OMG! Really big odds ratios. Let's check other SNPs' odds ratios. Are they much smaller than the odds ratios of causal SNPs?
1000] or[
## [1] 3.778327
1500] or[
## [1] 1.652893
3500] or[
## [1] 0.9224015
# exclude monomorphic SNPs
= hapmap$map[-mono,]
useful_snps
# build a dataframe with snp.name, chromosome, position, p-values, then remove the causal SNPs by their row index
= data.frame(snp.name = useful_snps$snp.name, chromosome = useful_snps$chromosome, position = useful_snps$position, pvalue = p, odds_ratio = or, z_value = z) %>% filter(!row_number() %in% causal_idx)
df
# preview the dataframe
head(df)