TileDBArray 1.2.1
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.22561980 0.84909115 0.08417527 . -0.7496890 2.1553773
## [2,] 0.64475488 -1.04585397 0.16569355 . -0.1805628 -0.8105910
## [3,] -0.50659263 2.42292738 -1.15445290 . 0.3001958 -2.0641087
## [4,] -0.37223217 0.77504604 -0.06959078 . 1.7287019 -0.1470217
## [5,] 1.33763338 -0.05339304 0.61029657 . -0.9929121 1.7691759
## ... . . . . . .
## [96,] -0.07946108 -0.94539604 -0.73827091 . -0.8310922 1.0824761
## [97,] 1.36283165 0.59184435 1.23353921 . -1.2149245 0.3298639
## [98,] 0.19398116 0.81909892 -0.30242949 . 0.7901528 -0.6793461
## [99,] -1.46537506 2.60446017 1.03067760 . -0.2593944 -0.8541233
## [100,] -1.58192096 -0.26567306 -0.53272213 . -0.4976779 0.6168890
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.22561980 0.84909115 0.08417527 . -0.7496890 2.1553773
## [2,] 0.64475488 -1.04585397 0.16569355 . -0.1805628 -0.8105910
## [3,] -0.50659263 2.42292738 -1.15445290 . 0.3001958 -2.0641087
## [4,] -0.37223217 0.77504604 -0.06959078 . 1.7287019 -0.1470217
## [5,] 1.33763338 -0.05339304 0.61029657 . -0.9929121 1.7691759
## ... . . . . . .
## [96,] -0.07946108 -0.94539604 -0.73827091 . -0.8310922 1.0824761
## [97,] 1.36283165 0.59184435 1.23353921 . -1.2149245 0.3298639
## [98,] 0.19398116 0.81909892 -0.30242949 . 0.7901528 -0.6793461
## [99,] -1.46537506 2.60446017 1.03067760 . -0.2593944 -0.8541233
## [100,] -1.58192096 -0.26567306 -0.53272213 . -0.4976779 0.6168890
This process also works for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0.00 0.00 0.00 . -1.3 0.0
## [997,] 0.00 0.00 0.00 . 0.0 0.0
## [998,] 0.00 0.00 0.00 . 0.0 0.0
## [999,] 0.00 0.00 -0.49 . 0.0 0.0
## [1000,] 0.00 0.00 -0.59 . 0.0 0.0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.22561980 0.84909115 0.08417527 . -0.7496890 2.1553773
## GENE_2 0.64475488 -1.04585397 0.16569355 . -0.1805628 -0.8105910
## GENE_3 -0.50659263 2.42292738 -1.15445290 . 0.3001958 -2.0641087
## GENE_4 -0.37223217 0.77504604 -0.06959078 . 1.7287019 -0.1470217
## GENE_5 1.33763338 -0.05339304 0.61029657 . -0.9929121 1.7691759
## ... . . . . . .
## GENE_96 -0.07946108 -0.94539604 -0.73827091 . -0.8310922 1.0824761
## GENE_97 1.36283165 0.59184435 1.23353921 . -1.2149245 0.3298639
## GENE_98 0.19398116 0.81909892 -0.30242949 . 0.7901528 -0.6793461
## GENE_99 -1.46537506 2.60446017 1.03067760 . -0.2593944 -0.8541233
## GENE_100 -1.58192096 -0.26567306 -0.53272213 . -0.4976779 0.6168890
TileDBArrays
TileDBArrays are simply DelayedArray objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## GENE_1 GENE_2 GENE_3 GENE_4 GENE_5 GENE_6
## 0.2256198 0.6447549 -0.5065926 -0.3722322 1.3376334 0.3251615
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 0.22561980 0.84909115 0.08417527 0.43226684 0.30195842
## GENE_2 0.64475488 -1.04585397 0.16569355 0.16733965 0.19326328
## GENE_3 -0.50659263 2.42292738 -1.15445290 -1.20177684 0.71832298
## GENE_4 -0.37223217 0.77504604 -0.06959078 -1.18060384 0.75020450
## GENE_5 1.33763338 -0.05339304 0.61029657 0.96514016 -1.04167284
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.4512396 1.6981823 0.1683505 . -1.4993781 4.3107545
## GENE_2 1.2895098 -2.0917079 0.3313871 . -0.3611255 -1.6211820
## GENE_3 -1.0131853 4.8458548 -2.3089058 . 0.6003915 -4.1282173
## GENE_4 -0.7444643 1.5500921 -0.1391816 . 3.4574037 -0.2940435
## GENE_5 2.6752668 -0.1067861 1.2205931 . -1.9858242 3.5383519
## ... . . . . . .
## GENE_96 -0.1589222 -1.8907921 -1.4765418 . -1.6621845 2.1649523
## GENE_97 2.7256633 1.1836887 2.4670784 . -2.4298490 0.6597277
## GENE_98 0.3879623 1.6381978 -0.6048590 . 1.5803056 -1.3586922
## GENE_99 -2.9307501 5.2089203 2.0613552 . -0.5187887 -1.7082465
## GENE_100 -3.1638419 -0.5313461 -1.0654443 . -0.9953558 1.2337780
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6
## -10.89503403 -1.48044305 -6.47904208 2.60804095 0.99402146 4.57520668
## SAMP_7 SAMP_8 SAMP_9 SAMP_10
## 0.24124000 5.31144347 -14.79323080 0.01215487
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 1.105427
## GENE_2 -0.332452
## GENE_3 1.912516
## GENE_4 1.224523
## GENE_5 1.829164
## ... .
## GENE_96 -2.1471692
## GENE_97 0.7988225
## GENE_98 0.4377561
## GENE_99 5.4337019
## GENE_100 0.2668242
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray().
For instance, the code below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.80926284 1.21406533 1.52926633 . -0.8987714 -0.1524675
## [2,] 1.59529342 0.72793134 -0.88332667 . -1.5710641 -0.7196544
## [3,] -0.19516374 1.40518207 -0.04645728 . -0.7133946 0.9385561
## [4,] -0.68232865 -0.40209463 -0.06366155 . -0.5223599 -0.4269877
## [5,] -0.69078102 0.11897049 0.05646233 . 1.6526142 -2.0277854
## ... . . . . . .
## [96,] -0.6268708 -1.0913551 0.1972029 . -0.03715514 0.97105680
## [97,] 0.6124675 -1.4005044 -3.3787824 . 1.01293465 -1.21578233
## [98,] 0.4029066 1.2460312 -0.1246861 . -0.30401224 -2.08973585
## [99,] 0.4024117 0.1428827 -0.5445302 . -0.09476695 -0.09583711
## [100,] -1.8985583 -1.2193605 0.1226939 . -0.72329644 1.69266908
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.80926284 1.21406533 1.52926633 . -0.8987714 -0.1524675
## [2,] 1.59529342 0.72793134 -0.88332667 . -1.5710641 -0.7196544
## [3,] -0.19516374 1.40518207 -0.04645728 . -0.7133946 0.9385561
## [4,] -0.68232865 -0.40209463 -0.06366155 . -0.5223599 -0.4269877
## [5,] -0.69078102 0.11897049 0.05646233 . 1.6526142 -2.0277854
## ... . . . . . .
## [96,] -0.6268708 -1.0913551 0.1972029 . -0.03715514 0.97105680
## [97,] 0.6124675 -1.4005044 -3.3787824 . 1.01293465 -1.21578233
## [98,] 0.4029066 1.2460312 -0.1246861 . -0.30401224 -2.08973585
## [99,] 0.4024117 0.1428827 -0.5445302 . -0.09476695 -0.09583711
## [100,] -1.8985583 -1.2193605 0.1226939 . -0.72329644 1.69266908
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] TileDBArray_1.2.1 DelayedArray_0.18.0 IRanges_2.26.0
## [4] S4Vectors_0.30.0 MatrixGenerics_1.4.0 matrixStats_0.58.0
## [7] BiocGenerics_0.38.0 Matrix_1.3-3 BiocStyle_2.20.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.6 bslib_0.2.5.1 compiler_4.1.0
## [4] BiocManager_1.30.15 jquerylib_0.1.4 tools_4.1.0
## [7] digest_0.6.27 bit_4.0.4 jsonlite_1.7.2
## [10] evaluate_0.14 lattice_0.20-44 nanotime_0.3.2
## [13] rlang_0.4.11 RcppCCTZ_0.2.9 yaml_2.2.1
## [16] xfun_0.23 stringr_1.4.0 knitr_1.33
## [19] sass_0.4.0 bit64_4.0.5 grid_4.1.0
## [22] R6_2.5.0 rmarkdown_2.8 bookdown_0.22
## [25] tiledb_0.9.2 magrittr_2.0.1 htmltools_0.5.1.1
## [28] stringi_1.6.2 zoo_1.8-9