Step 3 Generate a sub-dataset

3.1 Pick 2000 SNPs from 2 chromosomes as our small dataset

# How many SNPs from each chromosome you want to include in your dataset
SNPS = 2000

# Get all snps you want to include to include in your dataset
startrow = 1
rowidx = c()
newmap = c()
for (i  in c(1:2)){
  newmap = rbind(newmap, hapmap$map[(startrow:(startrow+(SNPS-1))),])
  rowidx = cbind(rowidx, startrow:(startrow+SNPS-1))
  startrow = startrow + nrow(hapmap$map %>% filter(chromosome==i))
}

# Update hapmap to only include 2*2000 snps
hapmap <- read.plink(bed, bim, fam, select.snps = rowidx)
hapmap

# confirm that we have 4000 SNPs and 165 people
hapmap$genotypes

## A SnpMatrix with  165 rows and  4000 columns
## Row names:  NA06989 ... NA12865 
## Col names:  rs2185539 ... rs2694088

3.2 Remove the monomorphic SNPs in this subset

#get the index of SNPs whose MAF is zero to exclude monomorphic SNPs
mono <- which(col.summary(hapmap$genotype)$MAF == 0)

#nomonosnps contain snps that are not monomorphic
nomonosnps = hapmap$genotypes[,-mono]

#nomonosnps is a SnpMatrix, let's convert it to a matrix for later use
X <- as(nomonosnps, "numeric")

#check how many non-monomorphic SNPs do we have, and starting from here, we will use these non-monomorphic SNPs only
ncol(X)

## [1] 3582