Use TxDb packages

In the Bioconductor annotation ecosystem, there are TxDb.* packages which provide the gene annotations (and hence the TSS information) that rGREAT uses together with Gene Ontology gene sets. The TxDb.* packages supported in rGREAT are:

library(rGREAT)
rGREAT:::BIOC_ANNO_PKGS$txdb
##  [1] "TxDb.Hsapiens.UCSC.hg18.knownGene"     
##  [2] "TxDb.Hsapiens.UCSC.hg19.knownGene"     
##  [3] "TxDb.Hsapiens.UCSC.hg38.knownGene"     
##  [4] "TxDb.Hsapiens.UCSC.hg38.refGene"       
##  [5] "TxDb.Mmusculus.UCSC.mm10.knownGene"    
##  [6] "TxDb.Mmusculus.UCSC.mm10.ensGene"      
##  [7] "TxDb.Mmusculus.UCSC.mm39.refGene"      
##  [8] "TxDb.Mmusculus.UCSC.mm9.knownGene"     
##  [9] "TxDb.Rnorvegicus.UCSC.rn4.ensGene"     
## [10] "TxDb.Rnorvegicus.UCSC.rn5.refGene"     
## [11] "TxDb.Rnorvegicus.UCSC.rn6.refGene"     
## [12] "TxDb.Rnorvegicus.UCSC.rn7.refGene"     
## [13] "TxDb.Ggallus.UCSC.galGal4.refGene"     
## [14] "TxDb.Ggallus.UCSC.galGal5.refGene"     
## [15] "TxDb.Ggallus.UCSC.galGal6.refGene"     
## [16] "TxDb.Mmulatta.UCSC.rheMac10.refGene"   
## [17] "TxDb.Mmulatta.UCSC.rheMac3.refGene"    
## [18] "TxDb.Mmulatta.UCSC.rheMac8.refGene"    
## [19] "TxDb.Celegans.UCSC.ce11.refGene"       
## [20] "TxDb.Celegans.UCSC.ce11.ensGene"       
## [21] "TxDb.Cfamiliaris.UCSC.canFam3.refGene" 
## [22] "TxDb.Cfamiliaris.UCSC.canFam4.refGene" 
## [23] "TxDb.Cfamiliaris.UCSC.canFam5.refGene" 
## [24] "TxDb.Sscrofa.UCSC.susScr11.refGene"    
## [25] "TxDb.Sscrofa.UCSC.susScr3.refGene"     
## [26] "TxDb.Scerevisiae.UCSC.sacCer2.sgdGene" 
## [27] "TxDb.Scerevisiae.UCSC.sacCer3.sgdGene" 
## [28] "TxDb.Ptroglodytes.UCSC.panTro4.refGene"
## [29] "TxDb.Ptroglodytes.UCSC.panTro5.refGene"
## [30] "TxDb.Ptroglodytes.UCSC.panTro6.refGene"
## [31] "TxDb.Dmelanogaster.UCSC.dm3.ensGene"   
## [32] "TxDb.Dmelanogaster.UCSC.dm6.ensGene"   
## [33] "TxDb.Drerio.UCSC.danRer10.refGene"     
## [34] "TxDb.Drerio.UCSC.danRer11.refGene"     
## [35] "TxDb.Btaurus.UCSC.bosTau8.refGene"     
## [36] "TxDb.Btaurus.UCSC.bosTau9.refGene"     
## [37] "TxDb.Athaliana.BioMart.plantsmart51"   
## [38] "TxDb.Athaliana.BioMart.plantsmart22"   
## [39] "TxDb.Athaliana.BioMart.plantsmart25"   
## [40] "TxDb.Athaliana.BioMart.plantsmart28"

To perform GREAT analysis with GO gene sets for other organisms, you can either specify the genome version:

great(gr, "GO:BP", "galGal6")

or with the full name of the corresponding TxDb package:

great(gr, "GO:BP", "TxDb.Ggallus.UCSC.galGal6.refGene")

These two are internally the same.

Use BioMart GO gene sets

You can specify a BioMart dataset (which corresponds to a specific organism), e.g.:

# Giant panda
great(gr, "GO:BP", biomart_dataset = "amelanoleuca_gene_ensembl")

A full list of supported BioMart datasets (organisms) can be found with the function BioMartGOGeneSets::supportedOrganisms().

Use MSigDB gene sets

MSigDB contains gene sets only for human, but it can be extended to other organisms by mapping to homologous genes. The package msigdbr has already mapped genes to many other organisms. A full list of supported organisms in msigdbr can be obtained by:

library(msigdbr)
msigdbr_species()
## # A tibble: 20 × 2
##    species_name                    species_common_name                          
##    <chr>                           <chr>                                        
##  1 Anolis carolinensis             Carolina anole, green anole                  
##  2 Bos taurus                      bovine, cattle, cow, dairy cow, domestic cat…
##  3 Caenorhabditis elegans          <NA>                                         
##  4 Canis lupus familiaris          dog, dogs                                    
##  5 Danio rerio                     leopard danio, zebra danio, zebra fish, zebr…
##  6 Drosophila melanogaster         fruit fly                                    
##  7 Equus caballus                  domestic horse, equine, horse                
##  8 Felis catus                     cat, cats, domestic cat                      
##  9 Gallus gallus                   bantam, chicken, chickens, Gallus domesticus 
## 10 Homo sapiens                    human                                        
## 11 Macaca mulatta                  rhesus macaque, rhesus macaques, Rhesus monk…
## 12 Monodelphis domestica           gray short-tailed opossum                    
## 13 Mus musculus                    house mouse, mouse                           
## 14 Ornithorhynchus anatinus        duck-billed platypus, duckbill platypus, pla…
## 15 Pan troglodytes                 chimpanzee                                   
## 16 Rattus norvegicus               brown rat, Norway rat, rat, rats             
## 17 Saccharomyces cerevisiae        baker's yeast, brewer's yeast, S. cerevisiae 
## 18 Schizosaccharomyces pombe 972h- <NA>                                         
## 19 Sus scrofa                      pig, pigs, swine, wild boar                  
## 20 Xenopus tropicalis              tropical clawed frog, western clawed frog

To obtain gene sets for a non-human organism, e.g. chimpanzee:

h_gene_sets = msigdbr(species = "chimpanzee", category = "H")
head(h_gene_sets)
## # A tibble: 6 × 18
##   gs_cat gs_subcat gs_name gene_…¹ entre…² ensem…³ human…⁴ human…⁵ human…⁶ gs_id
##   <chr>  <chr>     <chr>   <chr>     <int> <chr>   <chr>     <int> <chr>   <chr>
## 1 H      ""        HALLMA… ABCA1    464630 ENSPTR… ABCA1        19 ENSG00… M5905
## 2 H      ""        HALLMA… ABCB8    463892 ENSPTR… ABCB8     11194 ENSG00… M5905
## 3 H      ""        HALLMA… ACAA2    455414 ENSPTR… ACAA2     10449 ENSG00… M5905
## 4 H      ""        HALLMA… ACADL    459914 ENSPTR… ACADL        33 ENSG00… M5905
## 5 H      ""        HALLMA… ACADM    469356 ENSPTR… ACADM        34 ENSG00… M5905
## 6 H      ""        HALLMA… ACADS    742921 ENSPTR… ACADS        35 ENSG00… M5905
## # … with 8 more variables: gs_pmid <chr>, gs_geoid <chr>,
## #   gs_exact_source <chr>, gs_url <chr>, gs_description <chr>, taxon_id <int>,
## #   ortholog_sources <chr>, num_ortholog_sources <dbl>, and abbreviated
## #   variable names ¹​gene_symbol, ²​entrez_gene, ³​ensembl_gene,
## #   ⁴​human_gene_symbol, ⁵​human_entrez_gene, ⁶​human_ensembl_gene

If the organism you selected has a corresponding TxDb package available which provides TSS information, you need to make sure the gene sets use Entrez gene IDs as identifiers. (Most TxDb packages use Entrez IDs as the primary ID; you can check the variable rGREAT:::BIOC_ANNO_PKGS.)

# convert to a list of gene sets
h_gene_sets = split(h_gene_sets$entrez_gene, h_gene_sets$gs_name)
h_gene_sets = lapply(h_gene_sets, as.character)  # just to make sure gene IDs are all in character.
h_gene_sets[1:2]
## $HALLMARK_ADIPOGENESIS
##   [1] "464630"    "463892"    "455414"    "459914"    "469356"    "742921"   
##   [7] "454672"    "104003784" "454895"    "451866"    "737339"    "471032"   
##  [13] "451742"    "737305"    "100615914" "456723"    "107967644" "454362"   
##  [19] "464334"    "743667"    "741867"    "449586"    "100614256" "741708"   
##  [25] "459164"    "746692"    "473976"    "452433"    "468889"    "745443"   
##  [31] "460926"    "455644"    "451116"    "454684"    "744890"    "461229"   
##  [37] "740513"    "104005232" "463949"    "469319"    "748673"    "450673"   
##  [43] "468605"    "471455"    "456837"    "464611"    "452659"    "472079"   
##  [49] "452307"    "454118"    "100616508" "465727"    "742828"    "737945"   
##  [55] "107976794" "107976794" "746229"    "472893"    "456557"    "457056"   
##  [61] "747265"    "736777"    "464460"    "451393"    "745691"    "454512"   
##  [67] "466780"    "463861"    "744984"    "452566"    "457117"    NA         
##  [73] "747936"    "459360"    "461436"    "464353"    "464074"    "466651"   
##  [79] "451984"    "456243"    "464255"    "467738"    "466732"    "461244"   
##  [85] "456929"    "460520"    "450562"    "450738"    "464140"    "459670"   
##  [91] "452976"    "471703"    "741876"    "471135"    "461424"    "459828"   
##  [97] "452295"    "460113"    "453565"    "741179"    "747276"    "470423"   
## [103] "451967"    "450290"    "473975"    "473975"    "460157"    "462946"   
## [109] "449638"    "738797"    "456076"    "451807"    "464031"    "739986"   
## [115] "459173"    "460872"    "463484"    "462853"    "739167"    "457477"   
## [121] "742027"    "746245"    "472764"    "747387"    "744096"    "101057233"
## [127] "744811"    "463686"    "744435"    "468748"    "451175"    "460227"   
## [133] "454744"    "739996"    NA          "450735"    "454478"    "457929"   
## [139] "738397"    "458602"    "456908"    "451591"    "450310"    "107970333"
## [145] "465012"    "463481"    "463481"    "460178"    "470365"    "742092"   
## [151] "741184"    "459094"    "459374"    "456940"    "745779"    "454531"   
## [157] "737918"    "107973114" "742100"    "470420"    "468499"    "467657"   
## [163] "100608935" "462416"    "451281"    "470281"    "470281"    "452359"   
## [169] "456862"    "456526"    "747462"    "474051"    "456155"    "458647"   
## [175] "744390"    "455841"    "459096"    "459031"    "450574"    "449637"   
## [181] "450628"    "470477"    "471247"    "453405"    "739128"    "454681"   
## [187] "464707"    "470417"    "450933"    "459685"    "460443"    "468406"   
## [193] "458803"    "467151"    "464550"    "745004"    "451416"    "735808"   
## [199] "743144"    "460348"    "107974864" "471631"    "741897"    "463489"   
## 
## $HALLMARK_ALLOGRAFT_REJECTION
##   [1] "454210"    "461523"    "450363"    "100609296" "459646"    "740898"   
##   [7] "466415"    "450170"    "465345"    "456984"    "744209"    "449497"   
##  [13] "100608992" "459361"    "741390"    "468208"    "748142"    "473220"   
##  [19] "748205"    "736543"    "454593"    "747004"    "454579"    "747123"   
##  [25] "462689"    "460323"    "740071"    "450128"    "469524"    "449512"   
##  [31] "748272"    "470617"    "451584"    "742330"    "451585"    "450124"   
##  [37] "100615583" "473802"    "460569"    "745293"    "462191"    "740560"   
##  [43] "470892"    "470900"    "735755"    "470426"    "460577"    "465021"   
##  [49] "465607"    "736196"    "457127"    "453745"    "457277"    "738275"   
##  [55] "471200"    NA          "459634"    "457770"    "469142"    "463415"   
##  [61] "466216"    "458797"    "453714"    "469204"    "750603"    NA         
##  [67] "100615835" "740028"    "451695"    "451158"    "471510"    "738331"   
##  [73] "469584"    "739516"    "465940"    "461906"    "468521"    "457003"   
##  [79] "472959"    "457020"    "467610"    "461873"    "452825"    "460623"   
##  [85] "463280"    "746195"    "750725"    "100608816" "449592"    "471979"   
##  [91] "471977"    "471974"    "462591"    "462540"    "494187"    "450196"   
##  [97] "474132"    "473965"    "449517"    "747276"    "470077"    "743102"   
## [103] "472749"    "469657"    "736204"    "460816"    "471723"    "455851"   
## [109] "449564"    "737808"    "449644"    "739011"    "744277"    "450200"   
## [115] "461472"    "456370"    "450884"    "470203"    "101059843" "449565"   
## [121] "454005"    "463288"    "464245"    "745517"    "463371"    "470524"   
## [127] "462386"    "450927"    "454294"    "454045"    "458607"    "735556"   
## [133] "464979"    "450156"    "738375"    "456715"    "471734"    "736309"   
## [139] "744486"    "459682"    "745667"    "472771"    "462888"    "748652"   
## [145] "449582"    "458294"    "460699"    "459239"    "741196"    "460720"   
## [151] "100322885" "469743"    "455026"    "740704"    "740477"    "450512"   
## [157] "453993"    "456276"    "743176"    "748032"    "457607"    "462249"   
## [163] "464277"    "737451"    "746600"    "737526"    "456065"    "461536"   
## [169] "107966305" "746721"    "737070"    "459209"    "451169"    "450503"   
## [175] "461971"    "461023"    "459834"    "100610925" "471978"    "746399"   
## [181] "746814"    "456060"    "457742"    "451611"    "107971092" "461167"   
## [187] "471325"    "471374"    "471167"    "494186"    "748737"    "464876"   
## [193] "741922"    "745141"    "452125"    "453161"    "743187"    "459427"

Now we can perform the local GREAT analysis.

great(gr, h_gene_sets, "panTro6")

Self-define TSS and gene sets

Since great() allows both self-defined TSS and gene sets, great() can be used independently of the organism. Please refer to the vignette “Analyze with local GREAT” to find out how to manually set both TSS and gene sets.