Chapter 8 GWAS
8.1 GWAS analysis
In this exercise, we are conducting a GWAS for four different models: 1) Naive (T-test), 2) GLM accounting for population structure (Q), 3) MLM accounting for population structure (Q) and Kinship (K), 4) FarmCPU accounting for population structure (Q) and Kinship (K). Each model is testing the same hypothesis: Ho: No association between SNP and trait. Ha: There is an association between SNP and trait.
The Naive model is fit using base R functions and GLM, MLM and FarmCPU models are run using the rMVP R package (Yin et al. 2021).
Additional packages used in this section are data.table (Dowle and Srinivasan 2021), and tidyverse (Wickham 2021).
Load necessary packages
library(rMVP)
library(data.table)
library(tidyverse)
Set working directory to where rMVP files are in workshop materials
setwd("~/PGRP_mapping_workshop/data/genotypic_and_phenotypic_data/")
Read in phenotypes
= fread("WIDIV_942.phe", data.table = FALSE) phenos
Read in SNPs
= attach.big.matrix("WIDIV_942.geno.desc") myGD
Read in SNPs map
= fread("WIDIV_942.geno.map", data.table = FALSE) map
Read in Kinship matrix (A matrix)
= attach.big.matrix("WIDIV_942.kin.desc") myKin
Read in principle components (Q)
= attach.big.matrix("WIDIV_942.pc.desc")
snp_pcs = snp_pcs[,] snp_pcs
Run a naive model without any correction for kinship or population structure
### Make a dataframe with phenotypes and genotypes together
= cbind(phenos,t(myGD[,]))
GD_naive
### Fit a simple linear regression model for low heritability simulated trait regressed on SNPs and save those results for later
= sapply(4:ncol(GD_naive), function(x) anova(lm(GD_naive$Color_low_h2 ~ GD_naive[,x]))$'Pr(>F)'[1])
low_h2_naive = cbind(map,low_h2_naive)
low_h2_naive colnames(low_h2_naive)[6] = "Color_low_h2.Naive"
### Fit a simple linear regression model for high heritability simulated trait regressed on SNPs
= sapply(4:ncol(GD_naive), function(x) anova(lm(GD_naive$Color_high_h2 ~ GD_naive[,x]))$'Pr(>F)'[1])
high_h2_naive = cbind(map,high_h2_naive)
high_h2_naive colnames(high_h2_naive)[6] = "Color_high_h2.Naive"
### Save those results for visualizing later with IGV
setwd("~/PGRP_mapping_workshop/gwas_results/for_IGV_visualization/")
= low_h2_naive[,c(2,3,1,6)]
low_h2_naive_IGV colnames(low_h2_naive_IGV) = c("CHR","BP","SNP","P")
= high_h2_naive[,c(2,3,1,6)]
high_h2_naive_IGV colnames(high_h2_naive_IGV) = c("CHR","BP","SNP","P")
fwrite(low_h2_naive_IGV, file = "Color_low_h2_naive.gwas",col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
fwrite(high_h2_naive_IGV, file = "Color_high_h2_naive.gwas",col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
## Save results for simple static plot visualization later
setwd("~/PGRP_mapping_workshop/gwas_results/")
fwrite(low_h2_naive, file = "Color_low_h2_naive.txt",col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
fwrite(high_h2_naive, file = "Color_high_h2_naive.txt",col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
Run GLM with (Q), MLM with (Q) and (K), and FarmCPU with (Q) and (K) and save results for IGV visualization Note to visualize gwas results in IGV, you need a .gwas file which has four columns, CHR, SNP, BP, and P. The loop below writes the .gwas file for each model fit to the high heritability trait and the low heritability trait.
setwd("~/PGRP_mapping_workshop/gwas_results/for_IGV_visualization/")
for(i in colnames(phenos)[2:ncol(phenos)]){
= MVP(
gwas_kinship phe=phenos[, c("Taxa", i)], # Phenotype dataframe
geno=myGD, # Genotypic data as a big.matrix
map=map, # Map dataframe
K=myKin, # Kinship used in MLM
maxLoop=10, # Max number of iterations for FarmCPU
CV.GLM=snp_pcs[,1:3], # Covariates (Q) in GLM
CV.MLM=snp_pcs[,1:3], # Covariates (Q) in MLM
CV.FarmCPU=snp_pcs[,1:3], # Covariates (Q) in FarmCPU
method.bin="FaST-LMM", # Method for optimizing bins used by FarmCPU
method=c("GLM", "MLM", "FarmCPU"), # GWAS models being fit
file.output = FALSE # If set to TRUE, will generate many plots and files for each trait
)### These lines are simply for reformatting and remove columns which you would normally want to keep for real data such as effect sizes and standard error associated with each test.
= cbind(gwas_kinship$map,gwas_kinship$glm.results)
GLM = GLM[,c(2,3,1,8)]
GLM colnames(GLM) = c("CHR","BP","SNP","P")
fwrite(GLM, file = paste0(i,"_GLM_Q.gwas"),col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
= cbind(gwas_kinship$map,gwas_kinship$mlm.results)
MLM = MLM[,c(2,3,1,8)]
MLM colnames(MLM) = c("CHR","BP","SNP","P")
fwrite(MLM, file = paste0(i,"_MLM_QandK.gwas"),col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
= cbind(gwas_kinship$map,gwas_kinship$farmcpu.results)
CPU = CPU[,c(2,3,1,8)]
CPU colnames(CPU) = c("CHR","BP","SNP","P")
fwrite(CPU, file = paste0(i,"_FarmCPU_QandK.gwas"),col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
gc()
}
Run the same models, but saving output to a text file for static visualization later
setwd("~/PGRP_mapping_workshop/gwas_results/")
for(i in colnames(phenos)[2:ncol(phenos)]){
= MVP(
gwas_kinship phe=phenos[, c("Taxa", i)],
geno=myGD,
map=map,
K=myKin,
CV.GLM=snp_pcs[,1:3],
CV.MLM=snp_pcs[,1:3],
CV.FarmCPU=snp_pcs[,1:3],
maxLoop=10,
method.bin="FaST-LMM",
method=c("GLM", "MLM", "FarmCPU"),
file.output = FALSE
)= cbind(gwas_kinship$map,gwas_kinship$glm.results)
GLM = GLM[,-c(6:7)]
GLM colnames(GLM)[6] = paste0(colnames(GLM)[6],".Q")
fwrite(GLM, file = paste0(i,"_GLM_Q.txt"),col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
= cbind(gwas_kinship$map,gwas_kinship$mlm.results)
MLM = MLM[,-c(6:7)]
MLM colnames(MLM)[6] = paste0(colnames(MLM)[6],".QandK")
fwrite(MLM, file = paste0(i,"_MLM_QandK.txt"),col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
= cbind(gwas_kinship$map,gwas_kinship$farmcpu.results)
CPU = CPU[,-c(6:7)]
CPU colnames(CPU)[6] = paste0(colnames(CPU)[6],".QandK")
fwrite(CPU, file = paste0(i,"_FarmCPU_QandK.txt"),col.names = TRUE, sep = "\t", quote = FALSE, na = "NA")
gc()
}
8.2 Static plot visualization
Now that the analysis has been run, lets visualize the results in a static manhattan plot. For this we will use the CMplot R package (R-CMplot?). These plots are saved in the directory set below.
Load necessary packages
library(CMplot)
Read in the results which were saved to individual text files
setwd("~/PGRP_mapping_workshop/gwas_results/")
= list.files() # Lists the files in the directory
gwas_results_files = gwas_results_files[!gwas_results_files%in%c("for_IGV_visualization")] # Remove the "for_IGV_visualisation"
gwas_results_files
= list() # Make a list for the dataframes to be read into
gwas_results
## This loop reads in the results into a list. Each element of the list is a dataframe with the results for the model fit
for (i in gwas_results_files){
= fread(i)
gwas_results[[i]] }
Combine the results from a list into a single dataframe
= gwas_results %>%
all.results.pvals reduce(left_join, by = c("SNP","CHROM","POS","REF","ALT"))
= data.frame(all.results.pvals) all.results.pvals
Set the working directory to where the figures will be saved
setwd("~/PGRP_mapping_workshop/figures/gwas/")
Set the bonferroni correction at alpha = 0.05
= 0.05/nrow(all.results.pvals) bonf
Plot for high heritability trait models
= all.results.pvals[,c("SNP","CHROM","POS",
high_models "Color_high_h2.Naive","Color_high_h2.GLM.Q",
"Color_high_h2.MLM.QandK","Color_high_h2.FarmCPU.QandK")]
= CMplot(high_models,
plot_high_models band = 1.2,
cex.axis = 2,
plot.type=c("m","q"),
multracks = TRUE,
cex = 0.4,
threshold=bonf,
threshold.col="black",
threshold.lwd=1.25,
threshold.lty=2,
signal.cex=1.25,
LOG10=TRUE,
box=FALSE,
width=25,
## Other options
chr.den.col=c("darkgreen", "yellow", "red"),
cex.lab= 2,
file.output=TRUE,
verbose=TRUE,
memo = "HIGH_models")
From top to bottom, manhattan plots are for a Naive model (T-test), GLM model accounting for (Q), MLM model accounting for (Q) and (K), FarmCPU model accounting for (Q) and (K).
Plot for low heritability trait models
= all.results.pvals[,c("SNP","CHROM","POS",
low_models "Color_low_h2.Naive","Color_low_h2.GLM.Q",
"Color_low_h2.MLM.QandK","Color_low_h2.FarmCPU.QandK")]
= CMplot(low_models,
plot_low_models band = 1.2,
cex.axis = 2,
plot.type=c("m","q"),
multracks = TRUE,
cex = 0.9,
threshold=bonf,
threshold.col="black",
threshold.lwd=1.25,
threshold.lty=2,
signal.cex=1.25,
LOG10=TRUE,
box=FALSE,
width=25,
## Other options
chr.den.col=c("darkgreen", "yellow", "red"),
cex.lab= 2,
file.output=TRUE,
verbose=TRUE,
memo = "LOW_models")
From top to bottom, manhattan plots are for a Naive model (T-test), GLM model accounting for (Q), MLM model accounting for (Q) and (K), FarmCPU model accounting for (Q) and (K).
As a reminder, these are the QTLs we simulated.