Counting the sgRNAs with RSubread

To count the sgRNAs, we must first load the human GeCKO sgRNA library, and subsequently, convert it into the FASTA format to construct an index for alignment. To do that we use DNAStringSet and writeXStringSet functions from GenomicAlignments Bioconductor package.

GeCKO <- read.delim("sgRNA_library/GeCKOv21_Human.tsv")
GeCKO[1:2, ]
  gene_id          UID                  seq
1    A1BG HGLibA_00001 GTCGCTGAGCTCCGATTCGA
2    A1BG HGLibA_00002 ACCTGTAGTTGCCGGCGTGC
sgRNAs <- DNAStringSet(GeCKO$seq)
names(sgRNAs) <- GeCKO$UID
sgRNAs
DNAStringSet object of length 122411:
         width seq                                        names               
     [1]    20 GTCGCTGAGCTCCGATTCGA                       HGLibA_00001
     [2]    20 ACCTGTAGTTGCCGGCGTGC                       HGLibA_00002
     [3]    20 CGTCAGCGTCACATTGGCCA                       HGLibA_00003
     [4]    20 CGCGCACTGGTCCAGCGCAC                       HGLibA_00004
     [5]    20 CCAAGCTATATCCTGTGCGC                       HGLibA_00005
     ...   ... ...
[122407]    20 GGTGATGCCCTACCCGATGC                       HGLibB_57024
[122408]    20 GAGGGCCTCCAACATGTTCT                       HGLibB_57025
[122409]    20 GTTCTTCAACAGTCCACAAC                       HGLibB_57026
[122410]    20 CGTCGAGATTCTACTTCTTC                       HGLibB_57027
[122411]    20 CTGAAACATTTAACCAGTTG                       HGLibB_57028
writeXStringSet(sgRNAs, file = "./index/GeCKO.fa")

Build an index

Then we build an index of sgRNA library with buildindex function from RSubread package.

buildindex("./index/GeCKO", "./index/GeCKO.fa", indexSplit = FALSE)

Align the FASTQ files

Next step is to align the sequencing files with align function from RSubread and count the reads with readGAlignments function in GenomicAlignments.

The subsequent phase involves aligning the sequencing files (FASTQ files) using the align function from RSubread and counting the reads for each sgRNA using the readGAlignments function in GenomicAlignments.

fastqs <- dir(path = "./fastq_files", pattern = "*.fastq.gz", full.names = TRUE)
counts <- list()
mapping_results <- list()
for (i in 1:length(fastqs)) {
    mapping_results[[i]] <- align("./index/GeCKO", fastqs[i], output_file = gsub(".fastq.gz",
        ".bam", fastqs[i]), nthreads = 4, unique = TRUE, nBestLocations = 1, type = "DNA",
        TH1 = 1, maxMismatches = 0, indels = 0)
    temp <- readGAlignments(gsub(".fastq.gz", ".bam", fastqs[i]))
    counts[[i]] <- data.frame(table(seqnames(temp[width(temp) == "20"])), row.names = "Var1")
}
my_counts <- do.call(cbind, counts)
colnames(my_counts) <- c("Control_1", "Control_2", "ToxA_1", "ToxA_2", "ToxB_1",
    "ToxB_2")
write.table(my_counts, "my_counts.txt")

Create a DGEList object

For downstream analysis, here we are going to convert count matrix obtained in the previous section into a DGEList object using the DGEList function from edgeR package. The DGEList object consists of three components: counts, information about samples and gene annotations.

counts <- read.table("my_counts.txt", header = TRUE)
group <- factor(c("Control", "Control", "ToxA", "ToxA", "ToxB", "ToxB"), levels = c("Control",
    "ToxA", "ToxB"))
samples <- data.frame(group = group, sampleName = colnames(counts), biorep = rep(c(1,
    2), 3))
genes <- GeCKO
names(genes)[names(genes) == "gene_id"] <- "Symbol"
d <- DGEList(counts = counts, samples = samples, genes = genes)
d
An object of class "DGEList"
$counts
             Control_1 Control_2 ToxA_1 ToxA_2 ToxB_1 ToxB_2
HGLibA_00001         0         7      3      2      2      8
HGLibA_00002         0         0      0      0      0      0
HGLibA_00003         1         0      0      0      0      0
HGLibA_00004         9         0      0      0      0      1
HGLibA_00005         1         8      1      0      1      1
122406 more rows ...

$samples
            group lib.size norm.factors sampleName biorep
Control_1 Control   338214            1  Control_1      1
Control_2 Control   337711            1  Control_2      2
ToxA_1       ToxA   344188            1     ToxA_1      1
ToxA_2       ToxA   335848            1     ToxA_2      2
ToxB_1       ToxB   356569            1     ToxB_1      1
ToxB_2       ToxB   355131            1     ToxB_2      2

$genes
             Symbol          UID                  seq
HGLibA_00001   A1BG HGLibA_00001 GTCGCTGAGCTCCGATTCGA
HGLibA_00002   A1BG HGLibA_00002 ACCTGTAGTTGCCGGCGTGC
HGLibA_00003   A1BG HGLibA_00003 CGTCAGCGTCACATTGGCCA
HGLibA_00004   A1CF HGLibA_00004 CGCGCACTGGTCCAGCGCAC
HGLibA_00005   A1CF HGLibA_00005 CCAAGCTATATCCTGTGCGC
122406 more rows ...
save(d, file = "./RData/DGEList.RData")