In the Bioconductor annotation ecosystem, there are TxDb.* packages which provide the gene (TSS) annotations that are used together with Gene Ontology gene sets. The TxDb.* packages supported in rGREAT are:
library(rGREAT)
rGREAT:::BIOC_ANNO_PKGS$txdb
## [1] "TxDb.Hsapiens.UCSC.hg18.knownGene"
## [2] "TxDb.Hsapiens.UCSC.hg19.knownGene"
## [3] "TxDb.Hsapiens.UCSC.hg38.knownGene"
## [4] "TxDb.Hsapiens.UCSC.hg38.refGene"
## [5] "TxDb.Mmusculus.UCSC.mm10.knownGene"
## [6] "TxDb.Mmusculus.UCSC.mm10.ensGene"
## [7] "TxDb.Mmusculus.UCSC.mm39.refGene"
## [8] "TxDb.Mmusculus.UCSC.mm9.knownGene"
## [9] "TxDb.Rnorvegicus.UCSC.rn4.ensGene"
## [10] "TxDb.Rnorvegicus.UCSC.rn5.refGene"
## [11] "TxDb.Rnorvegicus.UCSC.rn6.refGene"
## [12] "TxDb.Rnorvegicus.UCSC.rn7.refGene"
## [13] "TxDb.Ggallus.UCSC.galGal4.refGene"
## [14] "TxDb.Ggallus.UCSC.galGal5.refGene"
## [15] "TxDb.Ggallus.UCSC.galGal6.refGene"
## [16] "TxDb.Mmulatta.UCSC.rheMac10.refGene"
## [17] "TxDb.Mmulatta.UCSC.rheMac3.refGene"
## [18] "TxDb.Mmulatta.UCSC.rheMac8.refGene"
## [19] "TxDb.Celegans.UCSC.ce11.refGene"
## [20] "TxDb.Celegans.UCSC.ce11.ensGene"
## [21] "TxDb.Cfamiliaris.UCSC.canFam3.refGene"
## [22] "TxDb.Cfamiliaris.UCSC.canFam4.refGene"
## [23] "TxDb.Cfamiliaris.UCSC.canFam5.refGene"
## [24] "TxDb.Sscrofa.UCSC.susScr11.refGene"
## [25] "TxDb.Sscrofa.UCSC.susScr3.refGene"
## [26] "TxDb.Scerevisiae.UCSC.sacCer2.sgdGene"
## [27] "TxDb.Scerevisiae.UCSC.sacCer3.sgdGene"
## [28] "TxDb.Ptroglodytes.UCSC.panTro4.refGene"
## [29] "TxDb.Ptroglodytes.UCSC.panTro5.refGene"
## [30] "TxDb.Ptroglodytes.UCSC.panTro6.refGene"
## [31] "TxDb.Dmelanogaster.UCSC.dm3.ensGene"
## [32] "TxDb.Dmelanogaster.UCSC.dm6.ensGene"
## [33] "TxDb.Drerio.UCSC.danRer10.refGene"
## [34] "TxDb.Drerio.UCSC.danRer11.refGene"
## [35] "TxDb.Btaurus.UCSC.bosTau8.refGene"
## [36] "TxDb.Btaurus.UCSC.bosTau9.refGene"
## [37] "TxDb.Athaliana.BioMart.plantsmart51"
## [38] "TxDb.Athaliana.BioMart.plantsmart22"
## [39] "TxDb.Athaliana.BioMart.plantsmart25"
## [40] "TxDb.Athaliana.BioMart.plantsmart28"
To perform GREAT analysis with GO gene sets for other organisms, you can either specify the genome version:
great(gr, "GO:BP", "galGal6")
or with the full name of the corresponding TxDb package:
great(gr, "GO:BP", "TxDb.Ggallus.UCSC.galGal6.refGene")
These two are internally the same.
You can also specify a BioMart dataset (which corresponds to a specific organism), e.g.:
# Giant panda
great(gr, "GO:BP", biomart_dataset = "amelanoleuca_gene_ensembl")
A full list of supported BioMart datasets (organisms) can be found with the function BioMartGOGeneSets::supportedOrganisms().
MSigDB contains gene sets only for human, but it can be extended to other organisms by mapping to the homologous genes. The package msigdbr has already mapped genes to many other organisms. A full list of supported organisms in msigdbr can be obtained by:
library(msigdbr)
msigdbr_species()
## # A tibble: 20 × 2
## species_name species_common_name
## <chr> <chr>
## 1 Anolis carolinensis Carolina anole, green anole
## 2 Bos taurus bovine, cattle, cow, dairy cow, domestic cat…
## 3 Caenorhabditis elegans <NA>
## 4 Canis lupus familiaris dog, dogs
## 5 Danio rerio leopard danio, zebra danio, zebra fish, zebr…
## 6 Drosophila melanogaster fruit fly
## 7 Equus caballus domestic horse, equine, horse
## 8 Felis catus cat, cats, domestic cat
## 9 Gallus gallus bantam, chicken, chickens, Gallus domesticus
## 10 Homo sapiens human
## 11 Macaca mulatta rhesus macaque, rhesus macaques, Rhesus monk…
## 12 Monodelphis domestica gray short-tailed opossum
## 13 Mus musculus house mouse, mouse
## 14 Ornithorhynchus anatinus duck-billed platypus, duckbill platypus, pla…
## 15 Pan troglodytes chimpanzee
## 16 Rattus norvegicus brown rat, Norway rat, rat, rats
## 17 Saccharomyces cerevisiae baker's yeast, brewer's yeast, S. cerevisiae
## 18 Schizosaccharomyces pombe 972h- <NA>
## 19 Sus scrofa pig, pigs, swine, wild boar
## 20 Xenopus tropicalis tropical clawed frog, western clawed frog
To obtain gene sets for a non-human organism, specify it with the species argument, e.g.:
h_gene_sets = msigdbr(species = "chimpanzee", category = "H")
head(h_gene_sets)
## # A tibble: 6 × 18
## gs_cat gs_subcat gs_name gene_…¹ entre…² ensem…³ human…⁴ human…⁵ human…⁶ gs_id
## <chr> <chr> <chr> <chr> <int> <chr> <chr> <int> <chr> <chr>
## 1 H "" HALLMA… ABCA1 464630 ENSPTR… ABCA1 19 ENSG00… M5905
## 2 H "" HALLMA… ABCB8 463892 ENSPTR… ABCB8 11194 ENSG00… M5905
## 3 H "" HALLMA… ACAA2 455414 ENSPTR… ACAA2 10449 ENSG00… M5905
## 4 H "" HALLMA… ACADL 459914 ENSPTR… ACADL 33 ENSG00… M5905
## 5 H "" HALLMA… ACADM 469356 ENSPTR… ACADM 34 ENSG00… M5905
## 6 H "" HALLMA… ACADS 742921 ENSPTR… ACADS 35 ENSG00… M5905
## # … with 8 more variables: gs_pmid <chr>, gs_geoid <chr>,
## # gs_exact_source <chr>, gs_url <chr>, gs_description <chr>, taxon_id <int>,
## # ortholog_sources <chr>, num_ortholog_sources <dbl>, and abbreviated
## # variable names ¹gene_symbol, ²entrez_gene, ³ensembl_gene,
## # ⁴human_gene_symbol, ⁵human_entrez_gene, ⁶human_ensembl_gene
If the organism you selected has a corresponding TxDb package available which provides TSS information, you need to make sure the gene sets use Entrez gene IDs as the identifiers (most TxDb packages use Entrez IDs as the primary ID; you can check the variable rGREAT:::BIOC_ANNO_PKGS).
# convert to a list of gene sets
h_gene_sets = split(h_gene_sets$entrez_gene, h_gene_sets$gs_name)
h_gene_sets = lapply(h_gene_sets, as.character) # just to make sure gene IDs are all in character.
h_gene_sets[1:2]
## $HALLMARK_ADIPOGENESIS
## [1] "464630" "463892" "455414" "459914" "469356" "742921"
## [7] "454672" "104003784" "454895" "451866" "737339" "471032"
## [13] "451742" "737305" "100615914" "456723" "107967644" "454362"
## [19] "464334" "743667" "741867" "449586" "100614256" "741708"
## [25] "459164" "746692" "473976" "452433" "468889" "745443"
## [31] "460926" "455644" "451116" "454684" "744890" "461229"
## [37] "740513" "104005232" "463949" "469319" "748673" "450673"
## [43] "468605" "471455" "456837" "464611" "452659" "472079"
## [49] "452307" "454118" "100616508" "465727" "742828" "737945"
## [55] "107976794" "107976794" "746229" "472893" "456557" "457056"
## [61] "747265" "736777" "464460" "451393" "745691" "454512"
## [67] "466780" "463861" "744984" "452566" "457117" NA
## [73] "747936" "459360" "461436" "464353" "464074" "466651"
## [79] "451984" "456243" "464255" "467738" "466732" "461244"
## [85] "456929" "460520" "450562" "450738" "464140" "459670"
## [91] "452976" "471703" "741876" "471135" "461424" "459828"
## [97] "452295" "460113" "453565" "741179" "747276" "470423"
## [103] "451967" "450290" "473975" "473975" "460157" "462946"
## [109] "449638" "738797" "456076" "451807" "464031" "739986"
## [115] "459173" "460872" "463484" "462853" "739167" "457477"
## [121] "742027" "746245" "472764" "747387" "744096" "101057233"
## [127] "744811" "463686" "744435" "468748" "451175" "460227"
## [133] "454744" "739996" NA "450735" "454478" "457929"
## [139] "738397" "458602" "456908" "451591" "450310" "107970333"
## [145] "465012" "463481" "463481" "460178" "470365" "742092"
## [151] "741184" "459094" "459374" "456940" "745779" "454531"
## [157] "737918" "107973114" "742100" "470420" "468499" "467657"
## [163] "100608935" "462416" "451281" "470281" "470281" "452359"
## [169] "456862" "456526" "747462" "474051" "456155" "458647"
## [175] "744390" "455841" "459096" "459031" "450574" "449637"
## [181] "450628" "470477" "471247" "453405" "739128" "454681"
## [187] "464707" "470417" "450933" "459685" "460443" "468406"
## [193] "458803" "467151" "464550" "745004" "451416" "735808"
## [199] "743144" "460348" "107974864" "471631" "741897" "463489"
##
## $HALLMARK_ALLOGRAFT_REJECTION
## [1] "454210" "461523" "450363" "100609296" "459646" "740898"
## [7] "466415" "450170" "465345" "456984" "744209" "449497"
## [13] "100608992" "459361" "741390" "468208" "748142" "473220"
## [19] "748205" "736543" "454593" "747004" "454579" "747123"
## [25] "462689" "460323" "740071" "450128" "469524" "449512"
## [31] "748272" "470617" "451584" "742330" "451585" "450124"
## [37] "100615583" "473802" "460569" "745293" "462191" "740560"
## [43] "470892" "470900" "735755" "470426" "460577" "465021"
## [49] "465607" "736196" "457127" "453745" "457277" "738275"
## [55] "471200" NA "459634" "457770" "469142" "463415"
## [61] "466216" "458797" "453714" "469204" "750603" NA
## [67] "100615835" "740028" "451695" "451158" "471510" "738331"
## [73] "469584" "739516" "465940" "461906" "468521" "457003"
## [79] "472959" "457020" "467610" "461873" "452825" "460623"
## [85] "463280" "746195" "750725" "100608816" "449592" "471979"
## [91] "471977" "471974" "462591" "462540" "494187" "450196"
## [97] "474132" "473965" "449517" "747276" "470077" "743102"
## [103] "472749" "469657" "736204" "460816" "471723" "455851"
## [109] "449564" "737808" "449644" "739011" "744277" "450200"
## [115] "461472" "456370" "450884" "470203" "101059843" "449565"
## [121] "454005" "463288" "464245" "745517" "463371" "470524"
## [127] "462386" "450927" "454294" "454045" "458607" "735556"
## [133] "464979" "450156" "738375" "456715" "471734" "736309"
## [139] "744486" "459682" "745667" "472771" "462888" "748652"
## [145] "449582" "458294" "460699" "459239" "741196" "460720"
## [151] "100322885" "469743" "455026" "740704" "740477" "450512"
## [157] "453993" "456276" "743176" "748032" "457607" "462249"
## [163] "464277" "737451" "746600" "737526" "456065" "461536"
## [169] "107966305" "746721" "737070" "459209" "451169" "450503"
## [175] "461971" "461023" "459834" "100610925" "471978" "746399"
## [181] "746814" "456060" "457742" "451611" "107971092" "461167"
## [187] "471325" "471374" "471167" "494186" "748737" "464876"
## [193] "741922" "745141" "452125" "453161" "743187" "459427"
Now we can perform the local GREAT analysis.
great(gr, h_gene_sets, "panTro6")
Since great() allows both self-defined TSS and gene sets, great() can be independent of organisms. Please refer to the vignette “Analyze with local GREAT” to find out how to manually set both TSS and gene sets.