TileDBArray 1.0.0
TileDB implements a framework for local and remote storage of dense and sparse arrays.
We can use this as a DelayedArray
backend to provide an array-level abstraction,
thus allowing the data to be used in many places where an ordinary array or matrix might be used.
The TileDBArray package implements the necessary wrappers around TileDB-R
to support read/write operations on TileDB arrays within the DelayedArray framework.
TileDBArray
Creating a TileDBArray
is as easy as:
X <- matrix(rnorm(1000), ncol=10)
library(TileDBArray)
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.4695075 -0.1536033 -0.8322050 . -0.216530081 -0.820035789
## [2,] 0.4714332 1.1065538 0.8135870 . -1.267760754 -0.230147046
## [3,] -0.7638455 -1.3537554 -0.1561337 . -0.007176995 1.528866753
## [4,] -0.3068718 0.4714274 -0.4029580 . -0.336199112 0.112503679
## [5,] 0.9349748 -0.4700607 0.3885031 . -1.393425861 0.223380459
## ... . . . . . .
## [96,] 0.5279343 0.1023838 -1.7291151 . 0.2098264 1.7144877
## [97,] 2.0783636 0.9704089 0.5229847 . -0.8869093 -1.6604553
## [98,] 0.7390380 1.4580415 0.3324778 . 1.0397619 -0.8634210
## [99,] -1.2263313 -0.5689368 1.0927582 . -0.1971321 0.4283889
## [100,] 0.3854366 -0.6961820 -2.0814346 . -0.9528907 -1.9242366
Alternatively, we can use coercion methods:
as(X, "TileDBArray")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] 0.4695075 -0.1536033 -0.8322050 . -0.216530081 -0.820035789
## [2,] 0.4714332 1.1065538 0.8135870 . -1.267760754 -0.230147046
## [3,] -0.7638455 -1.3537554 -0.1561337 . -0.007176995 1.528866753
## [4,] -0.3068718 0.4714274 -0.4029580 . -0.336199112 0.112503679
## [5,] 0.9349748 -0.4700607 0.3885031 . -1.393425861 0.223380459
## ... . . . . . .
## [96,] 0.5279343 0.1023838 -1.7291151 . 0.2098264 1.7144877
## [97,] 2.0783636 0.9704089 0.5229847 . -0.8869093 -1.6604553
## [98,] 0.7390380 1.4580415 0.3324778 . 1.0397619 -0.8634210
## [99,] -1.2263313 -0.5689368 1.0927582 . -0.1971321 0.4283889
## [100,] 0.3854366 -0.6961820 -2.0814346 . -0.9528907 -1.9242366
This process works also for sparse matrices:
Y <- Matrix::rsparsematrix(1000, 1000, density=0.01)
writeTileDBArray(Y)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] 0 0 0 . 0 0
## [2,] 0 0 0 . 0 0
## [3,] 0 0 0 . 0 0
## [4,] 0 0 0 . 0 0
## [5,] 0 0 0 . 0 0
## ... . . . . . .
## [996,] 0 0 0 . 0 0
## [997,] 0 0 0 . 0 0
## [998,] 0 0 0 . 0 0
## [999,] 0 0 0 . 0 0
## [1000,] 0 0 0 . 0 0
Logical and integer matrices are supported:
writeTileDBArray(Y > 0)
## <1000 x 1000> sparse matrix of class TileDBMatrix and type "logical":
## [,1] [,2] [,3] ... [,999] [,1000]
## [1,] FALSE FALSE FALSE . FALSE FALSE
## [2,] FALSE FALSE FALSE . FALSE FALSE
## [3,] FALSE FALSE FALSE . FALSE FALSE
## [4,] FALSE FALSE FALSE . FALSE FALSE
## [5,] FALSE FALSE FALSE . FALSE FALSE
## ... . . . . . .
## [996,] FALSE FALSE FALSE . FALSE FALSE
## [997,] FALSE FALSE FALSE . FALSE FALSE
## [998,] FALSE FALSE FALSE . FALSE FALSE
## [999,] FALSE FALSE FALSE . FALSE FALSE
## [1000,] FALSE FALSE FALSE . FALSE FALSE
As are matrices with dimension names:
rownames(X) <- sprintf("GENE_%i", seq_len(nrow(X)))
colnames(X) <- sprintf("SAMP_%i", seq_len(ncol(X)))
writeTileDBArray(X)
## <100 x 10> matrix of class TileDBMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.4695075 -0.1536033 -0.8322050 . -0.216530081 -0.820035789
## GENE_2 0.4714332 1.1065538 0.8135870 . -1.267760754 -0.230147046
## GENE_3 -0.7638455 -1.3537554 -0.1561337 . -0.007176995 1.528866753
## GENE_4 -0.3068718 0.4714274 -0.4029580 . -0.336199112 0.112503679
## GENE_5 0.9349748 -0.4700607 0.3885031 . -1.393425861 0.223380459
## ... . . . . . .
## GENE_96 0.5279343 0.1023838 -1.7291151 . 0.2098264 1.7144877
## GENE_97 2.0783636 0.9704089 0.5229847 . -0.8869093 -1.6604553
## GENE_98 0.7390380 1.4580415 0.3324778 . 1.0397619 -0.8634210
## GENE_99 -1.2263313 -0.5689368 1.0927582 . -0.1971321 0.4283889
## GENE_100 0.3854366 -0.6961820 -2.0814346 . -0.9528907 -1.9242366
TileDBArray
sTileDBArray
s are simply DelayedArray
objects and can be manipulated as such.
The usual conventions for extracting data from matrix-like objects work as expected:
out <- as(X, "TileDBArray")
dim(out)
## [1] 100 10
head(rownames(out))
## [1] "GENE_1" "GENE_2" "GENE_3" "GENE_4" "GENE_5" "GENE_6"
head(out[,1])
## [1] 0.4695075 0.4714332 -0.7638455 -0.3068718 0.9349748 0.8697633
We can also perform manipulations like subsetting and arithmetic.
Note that these operations do not affect the data in the TileDB backend;
rather, they are delayed until the values are explicitly required,
hence the creation of the DelayedMatrix
object.
out[1:5,1:5]
## <5 x 5> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5
## GENE_1 0.4695075 -0.1536033 -0.8322050 0.7789347 1.4411733
## GENE_2 0.4714332 1.1065538 0.8135870 1.0687610 -0.4234327
## GENE_3 -0.7638455 -1.3537554 -0.1561337 0.1199645 -0.6845167
## GENE_4 -0.3068718 0.4714274 -0.4029580 1.4858483 1.3235803
## GENE_5 0.9349748 -0.4700607 0.3885031 -0.1659683 -1.9235338
out * 2
## <100 x 10> matrix of class DelayedMatrix and type "double":
## SAMP_1 SAMP_2 SAMP_3 ... SAMP_9 SAMP_10
## GENE_1 0.9390150 -0.3072067 -1.6644101 . -0.43306016 -1.64007158
## GENE_2 0.9428664 2.2131076 1.6271740 . -2.53552151 -0.46029409
## GENE_3 -1.5276909 -2.7075109 -0.3122675 . -0.01435399 3.05773351
## GENE_4 -0.6137436 0.9428549 -0.8059159 . -0.67239822 0.22500736
## GENE_5 1.8699496 -0.9401214 0.7770062 . -2.78685172 0.44676092
## ... . . . . . .
## GENE_96 1.0558686 0.2047677 -3.4582303 . 0.4196528 3.4289754
## GENE_97 4.1567271 1.9408179 1.0459693 . -1.7738187 -3.3209106
## GENE_98 1.4780760 2.9160830 0.6649556 . 2.0795239 -1.7268419
## GENE_99 -2.4526625 -1.1378736 2.1855164 . -0.3942643 0.8567777
## GENE_100 0.7708731 -1.3923641 -4.1628691 . -1.9057814 -3.8484732
We can also do more complex matrix operations that are supported by DelayedArray:
colSums(out)
## SAMP_1 SAMP_2 SAMP_3 SAMP_4 SAMP_5 SAMP_6 SAMP_7
## 2.3190548 -3.1883724 14.0665424 0.6398062 0.6598219 6.2458670 2.5465798
## SAMP_8 SAMP_9 SAMP_10
## -0.7081215 -1.8437946 3.6637104
out %*% runif(ncol(out))
## <100 x 1> matrix of class DelayedMatrix and type "double":
## y
## GENE_1 0.6881455
## GENE_2 -2.3869472
## GENE_3 -0.8666771
## GENE_4 1.8945255
## GENE_5 -3.5186043
## ... .
## GENE_96 1.9499076
## GENE_97 -0.6950101
## GENE_98 0.2777651
## GENE_99 -0.6061603
## GENE_100 -5.1366841
We can adjust some parameters for creating the backend with appropriate arguments to writeTileDBArray()
.
For example, the example below allows us to control the path to the backend
as well as the name of the attribute containing the data.
X <- matrix(rnorm(1000), ncol=10)
path <- tempfile()
writeTileDBArray(X, path=path, attr="WHEE")
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.21130651 -2.32981644 1.13528679 . 0.46050819 -0.18808307
## [2,] 0.93485724 0.65682005 -0.07566573 . 0.35540957 0.78044244
## [3,] 0.88793409 -0.28957070 -1.21296278 . -0.01649357 0.15664396
## [4,] 1.82639416 0.12663312 -0.03304823 . -0.31730696 -0.92544908
## [5,] 1.51023744 -0.39219455 -0.61895208 . 0.77813708 -1.43326219
## ... . . . . . .
## [96,] 0.66982467 -0.05857117 1.38288407 . -0.1227409 -1.6044565
## [97,] 0.37571997 -0.62590522 0.50470006 . 0.4817874 1.2799410
## [98,] -1.17677924 -2.13071217 1.45959223 . -0.9470521 -2.6803048
## [99,] -0.37855747 0.84210955 0.05854815 . -0.2582330 0.7997802
## [100,] 0.41686218 0.03320671 1.23553003 . 0.5419969 2.1028142
As these arguments cannot be passed during coercion, we instead provide global variables that can be set or unset to affect the outcome.
path2 <- tempfile()
setTileDBPath(path2)
as(X, "TileDBArray") # uses path2 to store the backend.
## <100 x 10> matrix of class TileDBMatrix and type "double":
## [,1] [,2] [,3] ... [,9] [,10]
## [1,] -0.21130651 -2.32981644 1.13528679 . 0.46050819 -0.18808307
## [2,] 0.93485724 0.65682005 -0.07566573 . 0.35540957 0.78044244
## [3,] 0.88793409 -0.28957070 -1.21296278 . -0.01649357 0.15664396
## [4,] 1.82639416 0.12663312 -0.03304823 . -0.31730696 -0.92544908
## [5,] 1.51023744 -0.39219455 -0.61895208 . 0.77813708 -1.43326219
## ... . . . . . .
## [96,] 0.66982467 -0.05857117 1.38288407 . -0.1227409 -1.6044565
## [97,] 0.37571997 -0.62590522 0.50470006 . 0.4817874 1.2799410
## [98,] -1.17677924 -2.13071217 1.45959223 . -0.9470521 -2.6803048
## [99,] -0.37855747 0.84210955 0.05854815 . -0.2582330 0.7997802
## [100,] 0.41686218 0.03320671 1.23553003 . 0.5419969 2.1028142
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows Server 2012 R2 x64 (build 9600)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=C
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] TileDBArray_1.0.0 DelayedArray_0.16.0 IRanges_2.24.0
## [4] S4Vectors_0.28.0 MatrixGenerics_1.2.0 matrixStats_0.57.0
## [7] BiocGenerics_0.36.0 Matrix_1.2-18 BiocStyle_2.18.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.5 RcppCCTZ_0.2.9 knitr_1.30
## [4] magrittr_1.5 bit_4.0.4 nanotime_0.3.2
## [7] lattice_0.20-41 rlang_0.4.8 stringr_1.4.0
## [10] tools_4.0.3 grid_4.0.3 xfun_0.18
## [13] tiledb_0.8.2 htmltools_0.5.0 bit64_4.0.5
## [16] yaml_2.2.1 digest_0.6.27 bookdown_0.21
## [19] BiocManager_1.30.10 evaluate_0.14 rmarkdown_2.5
## [22] stringi_1.5.3 compiler_4.0.3 zoo_1.8-8