Pick 2000 SNPs from 2 chromosomes as our small dataset
# How many SNPs from each chromosome you want to include in your dataset
SNPS = 2000
# Get all snps you want to include to include in your dataset
startrow = 1
rowidx = c()
newmap = c()
for (i in c(1:2)){
newmap = rbind(newmap, hapmap$map[(startrow:(startrow+(SNPS-1))),])
rowidx = cbind(rowidx, startrow:(startrow+SNPS-1))
startrow = startrow + nrow(hapmap$map %>% filter(chromosome==i))
}
# Update hapmap to only include 2*2000 snps
hapmap <- read.plink(bed, bim, fam, select.snps = rowidx)
hapmap
# confirm that we have 4000 SNPs and 165 people
hapmap$genotypes
## A SnpMatrix with 165 rows and 4000 columns
## Row names: NA06989 ... NA12865
## Col names: rs2185539 ... rs2694088
Remove the monomorphic SNPs in this subset
#get the index of SNPs whose MAF is zero to exclude monomorphic SNPs
mono <- which(col.summary(hapmap$genotype)$MAF == 0)
#nomonosnps contain snps that are not monomorphic
nomonosnps = hapmap$genotypes[,-mono]
#nomonosnps is a SnpMatrix, let's convert it to a matrix for later use
X <- as(nomonosnps, "numeric")
#check how many non-monomorphic SNPs do we have, and starting from here, we will use these non-monomorphic SNPs only
ncol(X)
## [1] 3582