Contents

1 DE analysis with Raw TCGA data using Bioconductor’s ExperimentHub and DESeq2

TCGA re-processed RNA-Seq data from 9264 Tumor Samples and 741 normal samples across 24 cancer types and made it available via GSE62944 from GEO. This data is also available as an ExpressionSet from ExperimentHub and can be used for Differential Expression Analysis.

In the below example, we show how one can download this dataset from ExperimentHub.

library(ExperimentHub)
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     Filter, Find, Map, Position, Reduce, anyDuplicated, aperm, append,
##     as.data.frame, basename, cbind, colnames, dirname, do.call,
##     duplicated, eval, evalq, get, grep, grepl, intersect, is.unsorted,
##     lapply, mapply, match, mget, order, paste, pmax, pmax.int, pmin,
##     pmin.int, rank, rbind, rownames, sapply, saveRDS, setdiff, table,
##     tapply, union, unique, unsplit, which.max, which.min
## Loading required package: AnnotationHub
## Loading required package: BiocFileCache
## Loading required package: dbplyr
eh = ExperimentHub()
query(eh , "GSE62944")
## ExperimentHub with 3 records
## # snapshotDate(): 2024-10-24
## # $dataprovider: GEO
## # $species: Homo sapiens
## # $rdataclass: SummarizedExperiment, ExpressionSet
## # additional mcols(): taxonomyid, genome, description,
## #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
## #   rdatapath, sourceurl, sourcetype 
## # retrieve records with, e.g., 'object[["EH1"]]' 
## 
##            title                                                              
##   EH1    | RNA-Sequencing and clinical data for 7706 tumor samples from The...
##   EH1043 | RNA-Sequencing and clinical data for 9246 tumor samples from The...
##   EH1044 | RNA-Sequencing and clinical data for 741 normal samples from The...

One can then extract the data for this using

tcga_data <- eh[["EH1"]]
## see ?GSE62944 and browseVignettes('GSE62944') for documentation
## loading from cache

The different cancer types can be accessed using -

 head(phenoData(tcga_data)$CancerType)
## [1] GBM GBM GBM GBM GBM OV 
## 20 Levels: BLCA BRCA COAD GBM HNSC KICH KIRC KIRP LAML LGG LIHC LUAD ... UCEC

Above we show only the top 6 Cancer subtypes.

1.1 Case Study

We are interested in identifying the IDH1 mutant and IDH1 wild type samples from TCGA’s Low Grade Glioma Samples and then conducting a differential expression analysis using DESeq2

# subset the expression Set to contain only samples from LGG.
lgg_data <- tcga_data[, which(phenoData(tcga_data)$CancerType=="LGG")]

# extract the IDHI mutant samples
mut_idx <- which(phenoData(lgg_data)$idh1_mutation_found=="YES")
mut_data <- exprs(lgg_data)[, mut_idx]

# extract the IDH1 WT samples
wt_idx <- which(phenoData(lgg_data)$idh1_mutation_found=="NO")
wt_data <- exprs(lgg_data)[, wt_idx]

# make a countTable.
countData <- cbind(mut_data, wt_data)

# for DE analysis with DESeq2 we need a sampleTable
samples= c(colnames(mut_data), colnames(wt_data))
group =c(rep("mut",length(mut_idx)), rep("wt", length(wt_idx)))
coldata <- cbind(samples, group)
colnames(coldata) <- c("sampleName", "Group")
coldata[,"Group"] <- factor(coldata[,"Group"], c("wt","mut"))

# Now we can run DE analysis
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     I, expand.grid, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## The following object is masked from 'package:Biobase':
## 
##     rowMedians
ddsMat <- DESeqDataSetFromMatrix(countData = countData,
                                 colData = DataFrame(coldata),
                                 design = ~ Group)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds <- ddsMat
dds <- dds[ rowSums(counts(dds)) > 1, ]
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 865 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds) 
summary(res)
## 
## out of 22546 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 2892, 13%
## LFC < 0 (down)     : 5094, 23%
## outliers [1]       : 0, 0%
## low counts [2]     : 1749, 7.8%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

For a detailed RNASeq analysis see Mike Love’s RNASeq workflow

2 sessionInfo()

sessionInfo()
## R version 4.4.1 (2024-06-14)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.1 LTS
## 
## Matrix products: default
## BLAS:   /home/biocbuild/bbs-3.20-bioc/R/lib/libRblas.so 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB              LC_COLLATE=C              
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/New_York
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] DESeq2_1.46.0               SummarizedExperiment_1.36.0
##  [3] MatrixGenerics_1.18.0       matrixStats_1.4.1          
##  [5] GenomicRanges_1.58.0        GenomeInfoDb_1.42.0        
##  [7] IRanges_2.40.0              S4Vectors_0.44.0           
##  [9] GSE62944_1.34.0             GEOquery_2.74.0            
## [11] Biobase_2.66.0              ExperimentHub_2.14.0       
## [13] AnnotationHub_3.14.0        BiocFileCache_2.14.0       
## [15] dbplyr_2.5.0                BiocGenerics_0.52.0        
## [17] BiocStyle_2.34.0           
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1        dplyr_1.1.4             blob_1.2.4             
##  [4] filelock_1.0.3          Biostrings_2.74.0       fastmap_1.2.0          
##  [7] XML_3.99-0.17           digest_0.6.37           mime_0.12              
## [10] lifecycle_1.0.4         statmod_1.5.0           KEGGREST_1.46.0        
## [13] RSQLite_2.3.7           magrittr_2.0.3          compiler_4.4.1         
## [16] rlang_1.1.4             sass_0.4.9              tools_4.4.1            
## [19] utf8_1.2.4              yaml_2.3.10             data.table_1.16.2      
## [22] knitr_1.48              S4Arrays_1.6.0          bit_4.5.0              
## [25] curl_5.2.3              DelayedArray_0.32.0     xml2_1.3.6             
## [28] BiocParallel_1.40.0     abind_1.4-8             withr_3.0.2            
## [31] purrr_1.0.2             grid_4.4.1              fansi_1.0.6            
## [34] colorspace_2.1-1        ggplot2_3.5.1           scales_1.3.0           
## [37] cli_3.6.3               rmarkdown_2.28          crayon_1.5.3           
## [40] generics_0.1.3          httr_1.4.7              tzdb_0.4.0             
## [43] DBI_1.2.3               cachem_1.1.0            zlibbioc_1.52.0        
## [46] parallel_4.4.1          AnnotationDbi_1.68.0    BiocManager_1.30.25    
## [49] XVector_0.46.0          vctrs_0.6.5             Matrix_1.7-1           
## [52] jsonlite_1.8.9          bookdown_0.41           hms_1.1.3              
## [55] bit64_4.5.2             locfit_1.5-9.10         limma_3.62.0           
## [58] jquerylib_0.1.4         tidyr_1.3.1             glue_1.8.0             
## [61] codetools_0.2-20        gtable_0.3.6            BiocVersion_3.20.0     
## [64] UCSC.utils_1.2.0        munsell_0.5.1           tibble_3.2.1           
## [67] pillar_1.9.0            rappdirs_0.3.3          htmltools_0.5.8.1      
## [70] GenomeInfoDbData_1.2.13 R6_2.5.1                evaluate_1.0.1         
## [73] lattice_0.22-6          readr_2.1.5             rentrez_1.2.3          
## [76] png_0.1-8               memoise_2.0.1           bslib_0.8.0            
## [79] Rcpp_1.0.13             SparseArray_1.6.0       xfun_0.48              
## [82] pkgconfig_2.0.3