# Differential abundance analysis at guide level

There is classic edgeR and glm edgeR. ## Create the design matrix{-} For further statistical modeling using `edgeR`, we now need to define a design matrix.

``````design <- model.matrix(~group, data = yy.norm\$samples)
design``````
``````          (Intercept) groupToxA groupToxB
Control_2           1         0         0
Control_1           1         0         0
ToxA_2              1         1         0
ToxA_1              1         1         0
ToxB_2              1         0         1
ToxB_1              1         0         1
attr(,"assign")
[1] 0 1 1
attr(,"contrasts")
attr(,"contrasts")\$group
[1] "contr.treatment"``````

## 0.2 Classic edgeR

### 0.2.1 Fit a model when there is no biological replicates

While it’s generally advisable to include biological replicates in experimental designs for statistical rigor and robust conclusions, there may be situations in CRISPR screens where researchers choose not to include them initially, especially in exploratory phases. The decision often involves a trade-off between the depth of analysis and the available resources. However, as the study progresses and findings are prioritized, additional experiments with biological replicates are often conducted to validate and strengthen the results. In instances where biological replicates are unavailable, they are not ideal but classic `edgeR` provides two alternative solutions: one involves the `exactTest`, and the other employs the Generalized Linear Model Likelihood Ratio Test (`glmLRT`). In such scenarios, a crucial step is to determine a sensible estimate for common Biological Coefficient of Variation (BCV). However, the p-values and the number differentially abundant guides will depend on the bcv. Subsequently, this estimate is utilized as follows:

``````bcv <- 0.6
yy.norm.no_biorep <- DGEList(counts = yy.norm\$counts[, c(2, 4, 6)],
samples = yy.norm\$samples[c(2, 4, 6), ], genes = yy.norm\$genes)
et <- exactTest(yy.norm.no_biorep, dispersion = bcv^2, pair = c(1,
3))
``````Comparison of groups:  ToxB-Control
Symbol          UID                  seq
HGLibA_56745      ZNF784 HGLibA_56745 GCAAGCTGTAGTGCGCGCGC
HGLibA_62457 hsa-mir-590 HGLibA_62457 TTATTCATAAAAGTGCAGTA
HGLibA_04839       BRPF1 HGLibA_04839 TACTGCTGTGGCCCACGCCG
HGLibA_39135     PSMC3IP HGLibA_39135 AATCGTGGCCCTCACTGCTA
HGLibA_39953       RAB44 HGLibA_39953 CAAGATGACCAGCCGCCTGC
HGLibB_51676       TRPC5 HGLibB_51676 CTCCCGACTGAACATCTATA
logFC   logCPM       PValue          FDR
HGLibA_56745 16.40706 13.54169 4.396582e-10 2.458876e-05
HGLibA_62457 15.52052 12.65606 2.421539e-09 6.771472e-05
HGLibA_04839 14.37376 11.51312 2.197430e-08 2.349355e-04
HGLibA_39135 14.36467 11.51097 2.237347e-08 2.349355e-04
HGLibA_39953 14.35794 11.49787 2.266119e-08 2.349355e-04
HGLibB_51676 11.58312 12.08571 2.749944e-08 2.349355e-04``````
``plotMD(et)``

``````fit <- glmFit(yy.norm.no_biorep, dispersion = bcv^2)
lrt <- glmLRT(fit, coef = "y\$samples\$groupToxA")
topTags(lrt)``````
``````Coefficient:  y\$samples\$groupToxA
Symbol          UID                  seq    logFC
HGLibB_38324  PRDM13 HGLibB_38324 GCAAGTACCTGTCAGACCGC 13.79029
HGLibB_04039 B4GALT7 HGLibB_04039 GCGAGGACGACGAGTTCTAC 13.55125
HGLibA_20923   HBEGF HGLibA_20923 GCCCTCTCCGAAGCCGCTCC 13.38190
HGLibB_44908 SLC35B2 HGLibB_44908 TCCGCCTGAAGTACTGCACC 10.90573
HGLibB_42762  SBSPON HGLibB_42762 GACAAGCTACGTCTCCACAC 12.68573
HGLibB_20894   HBEGF HGLibB_20894 TCTTGAACTAGCTGCCACCC 10.36379
HGLibB_23505  IQGAP3 HGLibB_23505 CACTCACAGGCTGCCCAGCC 12.37695
HGLibA_07161  CAPN15 HGLibA_07161 CATGTCGTCCACCAGCACCG 12.08184
HGLibA_56457  ZNF649 HGLibA_56457 ACTTAAGCTGTGACTTGCTG 11.97552
HGLibA_55349 ZKSCAN2 HGLibA_55349 ACCAGTGAAAGATGTCCACG 11.95656
logCPM       LR       PValue          FDR
HGLibB_38324 10.887298 32.72439 1.061960e-08 0.0004436210
HGLibB_04039 10.649647 31.80684 1.702923e-08 0.0004436210
HGLibA_20923 10.640050 31.15712 2.379643e-08 0.0004436210
HGLibB_44908 11.410148 30.37388 3.562930e-08 0.0004981600
HGLibB_42762  9.912502 28.48975 9.419562e-08 0.0009711574
HGLibB_20894 11.041123 28.29456 1.041884e-07 0.0009711574
HGLibB_23505  9.517303 27.30909 1.733939e-07 0.0013853431
HGLibA_07161  9.336969 26.18251 3.106240e-07 0.0020335084
HGLibA_56457  9.198546 25.77718 3.831925e-07 0.0020335084
HGLibA_55349  9.262887 25.70492 3.978110e-07 0.0020335084``````
``````plotMD(lrt, main = "ToxA vs Control")
abline(h = c(-4, 4), col = "blue")``````

``````# go <- goana(lrt) remove GO.db The blue lines indicate 2-fold
# changes.``````
``v <- voom(yy.norm, design, plot = TRUE)``

``````vfit <- lmFit(v, design)
efit <- eBayes(vfit)
plotSA(efit)``````

``summary(decideTests(efit))``
``````       (Intercept) groupToxA groupToxB
Down             0     22187     23861
NotSig        7965     27921     29501
Up           47962      5819      2565``````
``````tfit <- treat(vfit, lfc = 2)
dt <- decideTests(tfit)
summary(dt)``````
``````       (Intercept) groupToxA groupToxB
Down             0      6329      6416
NotSig       38603     47485     48871
Up           17324      2113       640``````
``````toxa.vs.ctrl <- topTreat(tfit, coef = "groupToxA", n = Inf)
toxb.vs.ctrl <- topTreat(tfit, coef = "groupToxB", n = Inf)
``````              Symbol          UID                  seq    logFC
HGLibA_32966   NUDT2 HGLibA_32966 ATGAGCACCAAGCCTACCGC 9.356170
HGLibA_36228     PGC HGLibA_36228 ACGACTCGCTGGGGTTGAAG 8.215204
HGLibB_44452 SLC19A3 HGLibB_44452 CTGAATCACCAAGGCAATAA 8.319624
HGLibA_28771   MED20 HGLibA_28771 AACAGACATGATGCGGTCTA 7.952625
HGLibA_28209    MAP4 HGLibA_28209 CATACCGTAACTGTCTTTCA 8.233813
HGLibA_32966 5.682929 36.13809 4.663422e-07 0.00296124
HGLibA_36228 4.800247 33.54411 6.359671e-07 0.00296124
HGLibA_38909 4.665411 33.61408 6.369277e-07 0.00296124
HGLibB_44452 4.095367 33.24597 6.643360e-07 0.00296124
HGLibA_28771 3.444822 31.84156 8.013316e-07 0.00296124
HGLibA_28209 3.538396 31.62572 8.329187e-07 0.00296124``````
``dim(toxa.vs.ctrl)``
``[1] 55927     8``
``dim(toxa.vs.ctrl[toxa.vs.ctrl\$adj.P.Val <= 0.05, ])``
``[1] 8442    8``
``head(toxb.vs.ctrl)``
``````             Symbol          UID                  seq    logFC
HGLibA_11002 CPSF3L HGLibA_11002 CTTTGGACACTTCGAGTGGC 9.248411
HGLibA_42364   RSU1 HGLibA_42364 TTCTTCAGTTCTGCGATGTT 9.248074
HGLibA_07730 CCDC22 HGLibA_07730 TCCAGAGCCACGGGAGTTCC 8.921697
HGLibA_52066  TTC19 HGLibA_52066 GTTCTGCGCAGCATAGATAC 8.853388
HGLibB_35573   PCLO HGLibB_35573 TGCTGAAGCGGATCCCTACC 8.282755
HGLibA_39824  RAB18 HGLibA_39824 GACATAGTAAACATGCTAGT 8.197197
HGLibA_11002 3.898377 40.11934 2.882188e-07 0.005210632
HGLibA_42364 3.898265 39.88013 2.962161e-07 0.005210632
HGLibA_07730 3.789472 38.23900 3.559404e-07 0.005210632
HGLibA_52066 3.766703 37.84200 3.726738e-07 0.005210632
HGLibB_35573 3.576501 34.45187 5.638743e-07 0.005877772
HGLibA_39824 3.547961 33.60296 6.305832e-07 0.005877772``````

## Biological variation

The vertical axis of the plot shows square-root dispersion, also known as biological coefficient of variation (BCV). For RNA-seq studies, the NB dispersions tend to be higher for genes with very low counts. The dispersion trend tends to decrease smoothly with abundance and to asymptotic to a constant value for genes with larger counts. From our past experience, the asymptotic value for the BCV tends to be in range from 0.05 to 0.2 for genetically identical mice or cell lines, whereas somewhat larger values (> 0.3) are observed for human subjects. The NB model can be extended with quasi-likelihood (QL) methods to account for gene-specific variability from both biological and technical sources7,12. Under the QL framework, the NB dispersion trend is used to describe the overall biological variability across all genes, and gene-specific variability above and below the overall level is picked up by the QL dispersion. In the QL approach, the individual (tagwise) NB dispersions are not used.

``````yy.norm <- estimateDisp(yy.norm, design)
yy.norm\$common.dispersion``````
``[1] 0.6331858``
``plotBCV(yy.norm)``

The square root of dispersion is the coefficient of biological variation (BCV). The common BCV is on the high side, considering that this is a designed experiment using genetically identical plants. The trended dispersion shows a decreasing trend with expression level. At low logCPM, the dispersions are very large indeed.Note that only the trended dispersion is used under the quasi-likelihood (QL) pipeline. The tagwise and common estimates are shown here but will not be used further. The QL dispersions can be estimated using the glmQLFit function, and then be visualized with the plotQLDisp function.

## glmLRT edgeR

``````fit <- glmFit(yy.norm, design)
lrt <- glmLRT(fit, coef = "groupToxA")
topToxAvsCtrl.all <- topTags(lrt, n = Inf, sort.by = "PValue")\$table
topToxAvsCtrl <- topTags(lrt, n = Inf, sort.by = "PValue")\$table
dim(topToxAvsCtrl <- topToxAvsCtrl[topToxAvsCtrl\$FDR < 0.05, ])``````
``[1] 24709     8``
``length(intersect(toxa.vs.ctrl\$Symbol, topToxAvsCtrl\$Symbol))``
``[1] 14953``
``vennDiagram(dt[, 1:2], circle.col = c("turquoise", "salmon"))``

``````topToxAvsCtrl\$sgRNA <- rownames(topToxAvsCtrl)
topToxAvsCtrl\$Gene_Regulation[topToxAvsCtrl\$logFC >= 0] <- "Enriched in ToxA"
topToxAvsCtrl\$Gene_Regulation[topToxAvsCtrl\$logFC < 0] <- "Depleted in ToxA"
])``````
``````              Symbol          UID                  seq     logFC
HGLibB_38324  PRDM13 HGLibB_38324 GCAAGTACCTGTCAGACCGC 11.209176
HGLibA_32966   NUDT2 HGLibA_32966 ATGAGCACCAAGCCTACCGC 11.594388
HGLibB_42762  SBSPON HGLibB_42762 GACAAGCTACGTCTCCACAC 12.255728
HGLibB_44908 SLC35B2 HGLibB_44908 TCCGCCTGAAGTACTGCACC 10.286574
HGLibB_45454   SLIT2 HGLibB_45454 TCTCTAGTTCTTTAAGATCC 11.404868
HGLibB_23505  IQGAP3 HGLibB_23505 CACTCACAGGCTGCCCAGCC  9.781491
logCPM       LR       PValue          FDR
HGLibB_38324 10.829302 89.05910 3.831913e-21 2.143074e-16
HGLibA_32966  8.780164 73.01061 1.289546e-17 3.195504e-13
HGLibB_42762  9.530267 72.44893 1.714112e-17 3.195504e-13
HGLibB_44908 11.311403 69.70313 6.893661e-17 9.638544e-13
HGLibB_45454  8.763229 68.20288 1.475103e-16 1.649962e-12
HGLibB_23505  9.434533 66.97834 2.745064e-16 2.491273e-12
sgRNA  Gene_Regulation
HGLibB_38324 HGLibB_38324 Enriched in ToxA
HGLibA_32966 HGLibA_32966 Enriched in ToxA
HGLibB_42762 HGLibB_42762 Enriched in ToxA
HGLibB_44908 HGLibB_44908 Enriched in ToxA
HGLibB_45454 HGLibB_45454 Enriched in ToxA
HGLibB_23505 HGLibB_23505 Enriched in ToxA``````
``````datatable(format(topToxAvsCtrl <- topToxAvsCtrl[order(topToxAvsCtrl\$FDR,
decreasing = FALSE), c(-2, -3)], format = "e", digits = 3))``````