MAGeCKFlute provides three different methods for pathway enrichment analysis, including hypergeometric test (HGT), over-representation test (ORT), and gene set enrichment analysis (GSE). MAGeCKFlute also includes several functions for visualizing enrichment results. This vignette introduces the functions related to pathway enrichment analysis.
knitr::opts_chunk$set(tidy=FALSE, cache=TRUE,
dev="png", message=FALSE, error=FALSE, warning=TRUE)
Citation: if you use MAGeCKFlute in published research, please cite: Binbin Wang, Mei Wang, Wubing Zhang. “Integrative analysis of pooled CRISPR genetic screens using MAGeCKFlute.” Nature Protocols (2019), doi: 10.1038/s41596-018-0113-7.
if(!"MAGeCKFlute" %in% installed.packages()) BiocManager::install("MAGeCKFlute")
if(!"ggplot2" %in% installed.packages()) BiocManager::install("ggplot2")
library(MAGeCKFlute)
library(ggplot2)
MAGeCKFlute requires a weighted gene list for enrichment analysis.
file1 = file.path(system.file("extdata", package = "MAGeCKFlute"),
"testdata/rra.gene_summary.txt")
gdata = ReadRRA(file1)
genelist = gdata$Score
names(genelist) = gdata$id
genelist = sort(genelist, decreasing = TRUE)
head(genelist)
## Jak1 Stat1 Ifngr2 Ifngr1 B2m H2-D1
## 10.7440 9.0330 5.6097 5.4719 4.3433 4.2110
# Alternative functions EnrichAnalyzer and enrich.HGT.
hgtRes1 = EnrichAnalyzer(genelist[1:200], method = "HGT",
type = "Pathway", organism = "mmu")
head(hgtRes1@result)
## ID Description
## BIOCARTA_IFNG_PATHWAY BIOCARTA_IFNG_PATHWAY IFNG PATHWAY
## REACTOME_877312 REACTOME_877312 Regulation of IFNG signaling
## BIOCARTA_IL22BP_PATHWAY BIOCARTA_IL22BP_PATHWAY IL22BP PATHWAY
## BIOCARTA_IFNA_PATHWAY BIOCARTA_IFNA_PATHWAY IFNA PATHWAY
## BIOCARTA_TID_PATHWAY BIOCARTA_TID_PATHWAY TID PATHWAY
## REACTOME_877300 REACTOME_877300 Interferon gamma signaling
## NES pvalue p.adjust GeneRatio BgRatio
## BIOCARTA_IFNG_PATHWAY 18.307949 9.401935e-11 5.048839e-08 5/107 6/9264
## REACTOME_877312 9.714240 2.440253e-05 4.081607e-03 3/107 9/9264
## BIOCARTA_IL22BP_PATHWAY 15.317503 2.440253e-05 4.081607e-03 3/107 9/9264
## BIOCARTA_IFNA_PATHWAY 13.883283 3.040303e-05 4.081607e-03 4/107 18/9264
## BIOCARTA_TID_PATHWAY 9.455281 4.054116e-05 4.354120e-03 4/107 19/9264
## REACTOME_877300 9.714240 9.109707e-05 6.444888e-03 3/107 12/9264
## geneID
## BIOCARTA_IFNG_PATHWAY 16451/16452/20846/15979/15980
## REACTOME_877312 16452/15979/15980
## BIOCARTA_IL22BP_PATHWAY 16451/16452/20846
## BIOCARTA_IFNA_PATHWAY 16451/15975/15976/20846
## BIOCARTA_TID_PATHWAY 16452/22059/15979/15980
## REACTOME_877300 16452/15979/15980
## geneName Count
## BIOCARTA_IFNG_PATHWAY Jak1/Jak2/Stat1/Ifngr1/Ifngr2 5
## REACTOME_877312 Jak2/Ifngr1/Ifngr2 3
## BIOCARTA_IL22BP_PATHWAY Jak1/Jak2/Stat1 3
## BIOCARTA_IFNA_PATHWAY Jak1/Ifnar1/Ifnar2/Stat1 4
## BIOCARTA_TID_PATHWAY Jak2/Trp53/Ifngr1/Ifngr2 4
## REACTOME_877300 Jak2/Ifngr1/Ifngr2 3
# hgtRes2 = enrich.HGT(genelist[1:200])
# head(hgtRes2@result)
The ORT and GSEA are implemented in the R package clusterProfiler (Yu et al. 2012). So please install it prior to the analysis.
if(!"clusterProfiler" %in% installed.packages()){
BiocManager::install("clusterProfiler")
}
library(clusterProfiler)
# Alternative functions EnrichAnalyzer and enrich.ORT.
ortRes1 = EnrichAnalyzer(genelist[1:200], method = "ORT",
type = "KEGG", organism = "mmu")
head(ortRes1@result)
## ID
## KEGG_mmu05140 KEGG_mmu05140
## KEGG_mmu05235 KEGG_mmu05235
## KEGG_mmu04612 KEGG_mmu04612
## KEGG_mmu05340 KEGG_mmu05340
## KEGG_mmu05212 KEGG_mmu05212
## KEGG_mmu04658 KEGG_mmu04658
## Description NES
## KEGG_mmu05140 Leishmaniasis 17.491523
## KEGG_mmu05235 PD-L1 expression and PD-1 checkpoint pathway in cancer 17.795521
## KEGG_mmu04612 Antigen processing and presentation 9.038750
## KEGG_mmu05340 Primary immunodeficiency 5.717129
## KEGG_mmu05212 Pancreatic cancer 13.069760
## KEGG_mmu04658 Th1 and Th2 cell differentiation 18.307949
## pvalue p.adjust GeneRatio BgRatio
## KEGG_mmu05140 1.530265e-05 0.001254818 7/54 70/4536
## KEGG_mmu05235 5.495082e-04 0.016338254 6/54 88/4536
## KEGG_mmu04612 6.199206e-04 0.016338254 6/54 90/4536
## KEGG_mmu05340 7.969880e-04 0.016338254 4/54 36/4536
## KEGG_mmu05212 1.927944e-03 0.031618287 5/54 76/4536
## KEGG_mmu04658 3.670334e-03 0.044654889 5/54 88/4536
## geneID
## KEGG_mmu05140 16451/20846/15980/15979/16452/17357/16412
## KEGG_mmu05235 16451/20846/15980/15979/16452/13645
## KEGG_mmu04612 12010/14964/21354/21355/19727/53970
## KEGG_mmu05340 21354/21355/19727/53970
## KEGG_mmu05212 16451/20846/12575/13645/22059
## KEGG_mmu04658 16451/20846/15980/15979/16452
## geneName Count
## KEGG_mmu05140 Jak1/Stat1/Ifngr2/Ifngr1/Jak2/Marcksl1/Itgb1 7
## KEGG_mmu05235 Jak1/Stat1/Ifngr2/Ifngr1/Jak2/Egf 6
## KEGG_mmu04612 B2m/H2-D1/Tap1/Tap2/Rfxank/Rfx5 6
## KEGG_mmu05340 Tap1/Tap2/Rfxank/Rfx5 4
## KEGG_mmu05212 Jak1/Stat1/Cdkn1a/Egf/Trp53 5
## KEGG_mmu04658 Jak1/Stat1/Ifngr2/Ifngr1/Jak2 5
# ortRes2 = enrich.ORT(genelist[genelist< -1])
# head(ortRes2@result)
library(clusterProfiler)
# Alternative functions EnrichAnalyzer and enrich.GSE.
gseRes1 = EnrichAnalyzer(genelist, method = "GSEA", type = "Pathway", organism = "mmu")
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize, gseaParam, : There are ties in the preranked stats (8.16% of the list).
## The order of those tied genes will be arbitrary, which may produce unexpected results.
## Warning in fgseaMultilevel(pathways = pathways, stats = stats, minSize =
## minSize, : For some pathways, in reality P-values are less than 1e-10. You can
## set the `eps` argument to zero for better estimation.
require(ggplot2)
df = hgtRes1@result
df$logFDR = -log10(df$p.adjust)
p = BarView(df[1:5,], "Description", 'logFDR')
p = p + labs(x = NULL) + coord_flip()
p
# Or use function barplot from enrichplot package
barplot(hgtRes1, showCategory = 5)
## top: up-regulated pathways;
## bottom: down-regulated pathways
EnrichedView(hgtRes1, top = 5, bottom = 0, mode = 1)
EnrichedView(hgtRes1, top = 5, bottom = 0, mode = 2)
dotplot(hgtRes1, showCategory = 5)
library(enrichplot)
hgtRes1@result$geneID = hgtRes1@result$geneName
cnetplot(hgtRes1, 5)
heatplot(hgtRes1, showCategory = 5, foldChange = genelist)
tmp <- pairwise_termsim(hgtRes1)
emapplot(tmp, showCategory = 5, layout = "kk")
## Warning in emapplot.enrichResult(x, showCategory = showCategory, ...): Use 'layout.params = list(layout = your_value)' instead of 'layout'.
## The layout parameter will be removed in the next version.
# show GSEA results of one pathway
idx = which(gseRes1$NES>0)[1]
gseaplot(gseRes1, geneSetID = idx, title = gseRes1$Description[idx])
# show GSEA results of multiple pathways
gseaplot2(gseRes1, geneSetID = which(gseRes1$NES>0)[1:3])
For enrichment analysis, MAGeCKFlute signifies the public available gene sets, including Pathways (PID, KEGG, REACTOME, BIOCARTA, C2CP), GO terms (GOBP, GOCC, GOMF), Complexes (CORUM) and molecular signature from MsigDB (c1, c2, c3, c4, c5, c6, c7, HALLMARK).
Analysis of high-throughput data increasingly relies on pathway annotation and functional information derived from Gene Ontology, which is also useful in the analysis of CRISPR screens.
## combination of the gene sets
enrichComb = EnrichAnalyzer(genelist[1:200], type = "KEGG+REACTOME", organism = "mmu")
EnrichedView(enrichComb, top = 5)
## All pathways
enrich = EnrichAnalyzer(geneList = genelist[1:200], type = "REACTOME", organism = "mmu")
EnrichedView(enrich, top = 5)
## All gene ontology
enrichGo = EnrichAnalyzer(genelist[1:200], type = "GOBP", organism = "mmu")
Functional annotations from the pathways and GO are powerful in the context of network dynamics. However, the approach has limitations in particular for the analysis of CRISPR screenings, in which elements within a protein complex rather than complete pathways might have a strong selection. So we incorporate protein complex resource from CORUM database, which enable the identification of essential protein complexes from the CRISPR screens.
enrichPro = EnrichAnalyzer(genelist[1:200], type = "Complex", organism = "mmu")
EnrichedView(enrichPro, top = 5)
Usually, only the core genes in a pathway could be selected in a CRISPR screen, so we recommend to perform enrichment analysis on small pathways instead of pathways involving hundreds or even more genes.
enrich = EnrichAnalyzer(genelist[1:200], type = "GOBP", limit = c(2, 80), organism = "mmu")
EnrichedView(enrich, top = 5)
EnrichedFilter
.The EnrichedFilter
function tries to eliminate redundant pathways from the enrichment results by calculating the Jaccard index between pathways.
enrich1 = EnrichAnalyzer(genelist[1:200], type = "Pathway", organism = "mmu")
enrich2 = EnrichedFilter(enrich1)
EnrichedView(enrich1, top = 15)
EnrichedView(enrich2, top = 15)
sessionInfo()
## R version 4.3.0 RC (2023-04-13 r84269)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.17-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] enrichplot_1.20.0 ggplot2_3.4.2 clusterProfiler_4.8.0
## [4] MAGeCKFlute_2.4.0 BiocStyle_2.28.0
##
## loaded via a namespace (and not attached):
## [1] splines_4.3.0 later_1.3.0
## [3] bitops_1.0-7 ggplotify_0.1.0
## [5] filelock_1.0.2 tibble_3.2.1
## [7] polyclip_1.10-4 graph_1.78.0
## [9] XML_3.99-0.14 lifecycle_1.0.3
## [11] edgeR_3.42.0 lattice_0.21-8
## [13] MASS_7.3-59 magrittr_2.0.3
## [15] limma_3.56.0 sass_0.4.5
## [17] rmarkdown_2.21 jquerylib_0.1.4
## [19] yaml_2.3.7 httpuv_1.6.9
## [21] depmap_1.13.0 cowplot_1.1.1
## [23] DBI_1.1.3 RColorBrewer_1.1-3
## [25] zlibbioc_1.46.0 purrr_1.0.1
## [27] ggraph_2.1.0 BiocGenerics_0.46.0
## [29] msigdbr_7.5.1 RCurl_1.98-1.12
## [31] yulab.utils_0.0.6 tweenr_2.0.2
## [33] rappdirs_0.3.3 sva_3.48.0
## [35] GenomeInfoDbData_1.2.10 IRanges_2.34.0
## [37] S4Vectors_0.38.0 ggrepel_0.9.3
## [39] tidytree_0.4.2 genefilter_1.82.0
## [41] pheatmap_1.0.12 annotate_1.78.0
## [43] codetools_0.2-19 DOSE_3.26.0
## [45] ggforce_0.4.1 tidyselect_1.2.0
## [47] aplot_0.1.10 farver_2.1.1
## [49] viridis_0.6.2 matrixStats_0.63.0
## [51] stats4_4.3.0 BiocFileCache_2.8.0
## [53] pathview_1.40.0 jsonlite_1.8.4
## [55] ellipsis_0.3.2 tidygraph_1.2.3
## [57] survival_3.5-5 ggnewscale_0.4.8
## [59] tools_4.3.0 treeio_1.24.0
## [61] Rcpp_1.0.10 glue_1.6.2
## [63] gridExtra_2.3 xfun_0.39
## [65] mgcv_1.8-42 qvalue_2.32.0
## [67] MatrixGenerics_1.12.0 GenomeInfoDb_1.36.0
## [69] dplyr_1.1.2 withr_2.5.0
## [71] BiocManager_1.30.20 fastmap_1.1.1
## [73] fansi_1.0.4 digest_0.6.31
## [75] R6_2.5.1 mime_0.12
## [77] gridGraphics_0.5-1 colorspace_2.1-0
## [79] GO.db_3.17.0 RSQLite_2.3.1
## [81] utf8_1.2.3 tidyr_1.3.0
## [83] generics_0.1.3 data.table_1.14.8
## [85] graphlayouts_0.8.4 httr_1.4.5
## [87] scatterpie_0.1.9 pkgconfig_2.0.3
## [89] gtable_0.3.3 blob_1.2.4
## [91] XVector_0.40.0 shadowtext_0.1.2
## [93] htmltools_0.5.5 bookdown_0.33
## [95] fgsea_1.26.0 scales_1.2.1
## [97] Biobase_2.60.0 png_0.1-8
## [99] ggfun_0.0.9 knitr_1.42
## [101] reshape2_1.4.4 nlme_3.1-162
## [103] curl_5.0.0 org.Hs.eg.db_3.17.0
## [105] cachem_1.0.7 stringr_1.5.0
## [107] BiocVersion_3.17.1 parallel_4.3.0
## [109] HDO.db_0.99.1 AnnotationDbi_1.62.0
## [111] pillar_1.9.0 grid_4.3.0
## [113] vctrs_0.6.2 promises_1.2.0.1
## [115] dbplyr_2.3.2 xtable_1.8-4
## [117] Rgraphviz_2.44.0 evaluate_0.20
## [119] KEGGgraph_1.60.0 magick_2.7.4
## [121] cli_3.6.1 locfit_1.5-9.7
## [123] compiler_4.3.0 rlang_1.1.0
## [125] crayon_1.5.2 labeling_0.4.2
## [127] plyr_1.8.8 stringi_1.7.12
## [129] viridisLite_0.4.1 BiocParallel_1.34.0
## [131] babelgene_22.9 munsell_0.5.0
## [133] Biostrings_2.68.0 lazyeval_0.2.2
## [135] GOSemSim_2.26.0 Matrix_1.5-4
## [137] ExperimentHub_2.8.0 patchwork_1.1.2
## [139] bit64_4.0.5 KEGGREST_1.40.0
## [141] shiny_1.7.4 interactiveDisplayBase_1.38.0
## [143] highr_0.10 AnnotationHub_3.8.0
## [145] igraph_1.4.2 memoise_2.0.1
## [147] bslib_0.4.2 ggtree_3.8.0
## [149] fastmatch_1.1-3 bit_4.0.5
## [151] downloader_0.4 ape_5.7-1
## [153] gson_0.1.0
Yu, Guangchuang. 2018. Enrichplot: Visualization of Functional Enrichment Result. https://github.com/GuangchuangYu/enrichplot.
Yu, Guangchuang, Li-Gen Wang, Yanyan Han, and Qing-Yu He. 2012. “ClusterProfiler: An R Package for Comparing Biological Themes Among Gene Clusters.” OMICS: A Journal of Integrative Biology 16 (5): 284–87. https://doi.org/10.1089/omi.2011.0118.