netDx 1.4.3
This code block is not evaluated. Need a breakdown? Look at the following sections.
suppressWarnings(suppressMessages(require(netDx)))
suppressWarnings(suppressMessages(library(curatedTCGAData)))
# fetch data remotely
brca <- suppressMessages(curatedTCGAData("BRCA",c("mRNAArray"),FALSE,
version="1.1.38"))
# process input variables
staget <- sub("[abcd]","",sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget
pam50 <- colData(brca)$PAM50.mRNA
pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
pam50[which(pam50 %in% "Luminal A")] <- "LumA"
colData(brca)$pam_mod <- pam50
tmp <- colData(brca)$PAM50.mRNA
idx <- union(which(tmp %in% c("Normal-like","Luminal B","HER2-enriched")),
which(is.na(staget)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]
smp <- sampleMap(brca)
samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
# remove duplicate assays mapped to the same sample
notdup <- samps[which(!duplicated(samps$primary)),"colname"]
brca[[1]] <- suppressMessages(brca[[1]][,notdup])
# colData must have ID and STATUS columns
pID <- colData(brca)$patientID
colData(brca)$ID <- pID
colData(brca)$STATUS <- colData(brca)$pam_mod
# create grouping rules
groupList <- list()
# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
age="patient.age_at_initial_pathologic_diagnosis",
stage="STAGE"
)
# create function to tell netDx how to build features (PSN) from your data
makeNets <- function(dataList, groupList, netDir,...) {
netList <- c() # initialize before is.null() check
# make RNA nets (NOTE: the check for is.null() is important!)
# (Pearson correlation)
if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) {
netList <- makePSN_NamedMatrix(dataList[["BRCA_mRNAArray-20160128"]],
rownames(dataList[["BRCA_mRNAArray-20160128"]]),
groupList[["BRCA_mRNAArray-20160128"]],
netDir,verbose=FALSE,
writeProfiles=TRUE,...)
}
# make clinical nets (normalized difference)
netList2 <- c()
if (!is.null(groupList[["clinical"]])) {
netList2 <- makePSN_NamedMatrix(dataList$clinical,
rownames(dataList$clinical),
groupList[["clinical"]],netDir,
simMetric="custom",customFunc=normDiff, # custom function
writeProfiles=FALSE,
sparsify=TRUE,verbose=TRUE,...)
}
netList <- c(unlist(netList),unlist(netList2))
return(netList)
}
# run predictor
set.seed(42) # make results reproducible
outDir <- paste(tempdir(),randAlphanumString(),
"pred_output",sep=getFileSep())
# To see all messages, remove suppressMessages() and set logging="default".
# To keep all intermediate data, set keepAllData=TRUE
out <- buildPredictor(
dataList=brca,groupList=groupList,
makeNetFunc=makeNets,
outDir=outDir, ## netDx requires absolute path
numSplits=2L,featScoreMax=2L, featSelCutoff=1L,
numCores=1L,logging="none",
keepAllData=FALSE,debugMode=TRUE
)
# collect results
numSplits <- 2
st <- unique(colData(brca)$STATUS)
acc <- c() # accuracy
predList <- list() # prediction tables
featScores <- list() # feature scores per class
for (cur in unique(st)) featScores[[cur]] <- list()
for (k in 1:numSplits) {
pred <- out[[sprintf("Split%i",k)]][["predictions"]];
# predictions table
tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
sprintf("%s_SCORE",st))]
predList[[k]] <- tmp
# accuracy
acc <- c(acc, sum(tmp$PRED==tmp$STATUS)/nrow(tmp))
# feature scores
for (cur in unique(st)) {
tmp <- out[[sprintf("Split%i",k)]][["featureScores"]][[cur]]
colnames(tmp) <- c("PATHWAY_NAME","SCORE")
featScores[[cur]][[sprintf("Split%i",k)]] <- tmp
}
}
# plot ROC and PR curve, compute AUROC, AUPR
predPerf <- plotPerf(predList, predClasses=st)
# get table of feature scores for each split and patient label
featScores2 <- lapply(featScores, getNetConsensus)
# identify features that consistently perform well
featSelNet <- lapply(featScores2, function(x) {
callFeatSel(x, fsCutoff=1, fsPctPass=0)
})
# prepare data for EnrichmentMap plotting of top-scoring features
Emap_res <- getEMapInput_many(featScores2,pathList,
minScore=1,maxScore=2,pctPass=0,out$inputNets,verbose=FALSE)
gmtFiles <- list()
nodeAttrFiles <- list()
for (g in names(Emap_res)) {
outFile <- paste(outDir,sprintf("%s_nodeAttrs.txt",g),sep=getFileSep())
write.table(Emap_res[[g]][["nodeAttrs"]],file=outFile,
sep="\t",col=TRUE,row=FALSE,quote=FALSE)
nodeAttrFiles[[g]] <- outFile
outFile <- paste(outDir,sprintf("%s.gmt",g),sep=getFileSep())
conn <- suppressWarnings(
suppressMessages(base::file(outFile,"w")))
tmp <- Emap_res[[g]][["featureSets"]]
gmtFiles[[g]] <- outFile
for (cur in names(tmp)) {
curr <- sprintf("%s\t%s\t%s", cur,cur,
paste(tmp[[cur]],collapse="\t"))
writeLines(curr,con=conn)
}
close(conn)
}
# This step requires Cytoscape to be installed and running.
###plotEmap(gmtFiles[[1]],nodeAttrFiles[[1]],
### groupClusters=TRUE, hideNodeLabels=TRUE)
# collect data for integrated PSN
featScores2 <- lapply(featScores, getNetConsensus)
featSelNet <- lapply(featScores2, function(x) {
callFeatSel(x, fsCutoff=2, fsPctPass=1)
})
topPath <- gsub(".profile","",
unique(unlist(featSelNet)))
topPath <- gsub("_cont.txt","",topPath)
# create groupList limited to top features
g2 <- list();
for (nm in names(groupList)) {
cur <- groupList[[nm]]
idx <- which(names(cur) %in% topPath)
message(sprintf("%s: %i pathways", nm, length(idx)))
if (length(idx)>0) g2[[nm]] <- cur[idx]
}
# calculates integrated PSN, calculates grouping statistics,
# and plots integrates PSN. Set plotCytoscape=TRUE if Cytoscape is running.
psn <- suppressMessages(
plotIntegratedPatientNetwork(brca,
groupList=g2, makeNetFunc=makeNets,
aggFun="MEAN",prune_X=0.30,prune_useTop=TRUE,
numCores=1L,calcShortestPath=TRUE,
showStats=FALSE,
verbose=FALSE, plotCytoscape=FALSE)
)
# Visualize integrated patient similarity network as a tSNE plot
tsne <- plot_tSNE(psn$patientSimNetwork_unpruned,colData(brca))
In this example, we will build a binary breast tumour classifier from clinical data and gene expression data. We will use different rules to create features for each data layer. Specifically:
Feature scoring is performed over multiple random splits of the data into train and blind test partitions. Feature selected networks are those that consistently score highly across the multiple splits (e.g. those that score 9 out of 10 in >=70% of splits).
Conceptually, this is what the higher-level logic looks like for a cross-validation design. In the pseudocode example below, the predictor runs for 100 train/test splits. Within a split, features are scored from 0 to 10. Features scoring >=9 are used to predict labels on the held-out test set (20%).
(Note: these aren’t real function calls; this block just serves to illustrate the concept of the design for our purposes)
numSplits <- 100 # num times to split data into train/blind test samples
featScoreMax <- 10 # num folds for cross-validation, also max score for a network
featSelCutoff <- 9
netScores <- list() # collect <numSplits> set of netScores
perf <- list() # collect <numSplits> set of test evaluations
for k in 1:numSplits
[train, test] <- splitData(80:20) # split data using RNG seed
featScores[[k]] <- scoreFeatures(train, featScoreMax)
topFeat[[k]] <- applyFeatCutoff(featScores[[k]])
perf[[k]] <- collectPerformance(topFeat[[k]], test)
end
suppressWarnings(suppressMessages(require(netDx)))
In this example, we use curated data from The Cancer Genome Atlas, through the BioConductor curatedTCGAData
package. The goal is to classify a breast tumour into either a Luminal A subtype or otherwise. The predictor will integrate clinical variables selected by the user, along with gene expression data.
Here we load the required packages and download clinical and gene expression data.
suppressMessages(library(curatedTCGAData))
Take a look at the available data without downloading any:
curatedTCGAData(diseaseCode="BRCA", assays="*",dry.run=TRUE,
version="1.1.38")
## snapshotDate(): 2021-05-18
## See '?curatedTCGAData' for 'diseaseCode' and 'assays' inputs
## ah_id title file_size
## 1 EH584 BRCA_CNASeq-20160128 0 Mb
## 2 EH585 BRCA_CNASNP-20160128 9.8 Mb
## 3 EH586 BRCA_CNVSNP-20160128 2.8 Mb
## 4 EH588 BRCA_GISTIC_AllByGene-20160128 1.3 Mb
## 5 EH2121 BRCA_GISTIC_Peaks-20160128 0 Mb
## 6 EH589 BRCA_GISTIC_ThresholdedByGene-20160128 0.4 Mb
## 7 EH2122 BRCA_Methylation_methyl27-20160128_assays 63.2 Mb
## 8 EH2123 BRCA_Methylation_methyl27-20160128_se 0.4 Mb
## 9 EH2124 BRCA_Methylation_methyl450-20160128_assays 2613.2 Mb
## 10 EH2125 BRCA_Methylation_methyl450-20160128_se 6.1 Mb
## 11 EH593 BRCA_miRNASeqGene-20160128 0.6 Mb
## 12 EH594 BRCA_mRNAArray-20160128 27.3 Mb
## 13 EH595 BRCA_Mutation-20160128 4.5 Mb
## 14 EH596 BRCA_RNASeq2GeneNorm-20160128 64.5 Mb
## 15 EH597 BRCA_RNASeqGene-20160128 30 Mb
## 16 EH598 BRCA_RPPAArray-20160128 1.6 Mb
## rdataclass rdatadateadded rdatadateremoved
## 1 RaggedExperiment 2017-10-10 <NA>
## 2 RaggedExperiment 2017-10-10 <NA>
## 3 RaggedExperiment 2017-10-10 <NA>
## 4 SummarizedExperiment 2017-10-10 <NA>
## 5 RangedSummarizedExperiment 2019-01-09 <NA>
## 6 SummarizedExperiment 2017-10-10 <NA>
## 7 SummarizedExperiment 2019-01-09 <NA>
## 8 SummarizedExperiment 2019-01-09 <NA>
## 9 RaggedExperiment 2019-01-09 <NA>
## 10 SummarizedExperiment 2019-01-09 <NA>
## 11 SummarizedExperiment 2017-10-10 <NA>
## 12 SummarizedExperiment 2017-10-10 <NA>
## 13 RaggedExperiment 2017-10-10 <NA>
## 14 SummarizedExperiment 2017-10-10 <NA>
## 15 SummarizedExperiment 2017-10-10 <NA>
## 16 SummarizedExperiment 2017-10-10 <NA>
We will work only with the mRNA data in this example:
brca <- suppressMessages(curatedTCGAData("BRCA",c("mRNAArray"),FALSE,
version="1.1.38"))
This next code block prepares the TCGA data. In practice you would do this once, and save the data before running netDx, but we run it here to see an end-to-end example.
staget <- sub("[abcd]","",sub("t","",colData(brca)$pathology_T_stage))
staget <- suppressWarnings(as.integer(staget))
colData(brca)$STAGE <- staget
pam50 <- colData(brca)$PAM50.mRNA
pam50[which(!pam50 %in% "Luminal A")] <- "notLumA"
pam50[which(pam50 %in% "Luminal A")] <- "LumA"
colData(brca)$pam_mod <- pam50
tmp <- colData(brca)$PAM50.mRNA
idx <- union(which(tmp %in% c("Normal-like","Luminal B","HER2-enriched")),
which(is.na(staget)))
pID <- colData(brca)$patientID
tokeep <- setdiff(pID, pID[idx])
brca <- brca[,tokeep,]
# remove duplicate assays mapped to the same sample
smp <- sampleMap(brca)
samps <- smp[which(smp$assay=="BRCA_mRNAArray-20160128"),]
notdup <- samps[which(!duplicated(samps$primary)),"colname"]
brca[[1]] <- suppressMessages(brca[[1]][,notdup])
## harmonizing input:
## removing 44 sampleMap rows with 'colname' not in colnames of experiments
The important thing is to create ID
and STATUS
columns in the sample metadata table. netDx uses these to get the patient identifiers and labels, respectively.
pID <- colData(brca)$patientID
colData(brca)$ID <- pID
colData(brca)$STATUS <- colData(brca)$pam_mod
netDx allows the user to define a custom function that takes patient data and variable groupings as input, and returns a set of patient similarity networks (PSN) as output. The user can customize what datatypes are used, how they are grouped, and what defines patient similarity for a given datatype.
When running the predictor (next section), the user simply passes this custom function as an input variable; i.e. the makeNetFunc
parameter when calling buildPredictor()
.
Note: While netDx provides a high degree of flexibility in achieving your design of choice, it is up to the user to ensure that the design, i.e. the similarity metric and variable groupings, is appropriate for your application. Domain knowledge is almost likely required for good design.
netDx requires that this function take some generic parameters as input. These include:
dataList
: the patient data, provided as a MultiAssayExperiment
object. Refer to the tutorials for MultiAssayExperiment to see how to construct those objects from data.groupList
: sets of input data that would correspond to individual networks (e.g. genes grouped into pathways)netDir
: the directory where the resulting PSN would be stored.Here the BRCA data is already provided to us as a MultiAssayExperiment
object:
summary(brca)
## Length Class Mode
## 1 MultiAssayExperiment S4
This object tells the predictor how to group units when constructing a network. For examples, genes may be grouped into a network representing a pathway. This object is a list; the names match those of dataList
while each value is itself a list and reflects a potential network.
groupList <- list()
# genes in mRNA data are grouped by pathways
pathList <- readPathways(fetchPathwayDefinitions("January",2018))
## ---------------------------------------
## Fetching http://download.baderlab.org/EM_Genesets/January_01_2018/Human/symbol/Human_AllPathways_January_01_2018_symbol.gmt
## File: 3aeb7097bb288_Human_AllPathways_January_01_2018_symbol.gmt
## Read 3028 pathways in total, internal list has 3009 entries
## FILTER: sets with num genes in [10, 200]
## => 971 pathways excluded
## => 2038 left
groupList[["BRCA_mRNAArray-20160128"]] <- pathList[1:3]
# clinical data is not grouped; each variable is its own feature
groupList[["clinical"]] <- list(
age="patient.age_at_initial_pathologic_diagnosis",
stage="STAGE"
)
So the groupList
variable has one entry per data layer:
summary(groupList)
## Length Class Mode
## BRCA_mRNAArray-20160128 3 -none- list
## clinical 2 -none- list
Each entry contains a list, with one entry per feature. Here we have 3 pathway-level features for mRNA and two variable-level features for clinical data.
For example, here are the networks to be created with RNA data. Genes corresponding to pathways are to be grouped into individual network. Such a groupList would create pathway-level networks:
groupList[["BRCA_mRNAArray-20160128"]][1:3]
## $UREA_CYCLE
## [1] "SLC25A15" "CPS1" "ASL" "ARG2" "SLC25A2" "OTC"
## [7] "NMRAL1" "NAGS" "ASS1" "ARG1"
##
## $`CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I`
## [1] "AGPAT1" "GPD2" "ABHD5" "GPAT2" "CDS1" "LPCAT3" "LPCAT4" "CDS2"
## [9] "AGPAT6" "AGPAT5" "MBOAT7" "AGPAT9" "LCLAT1" "MBOAT2" "AGPAT4" "GPAM"
## [17] "AGPAT3" "AGPAT2"
##
## $`SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM`
## [1] "IPMK" "INPP5B" "INPP5F" "INPP5D" "MINPP1" "INPP5A" "ITPKA" "OCRL"
## [9] "ITPKC" "ITPKB" "SYNJ2" "INPP5J" "INPP5K" "PTEN" "IMPA2" "INPP1"
## [17] "SYNJ1" "INPPL1" "IMPA1" "IMPAD1"
For clinical data, we want to keep each variable as its own network:
head(groupList[["clinical"]])
## $age
## [1] "patient.age_at_initial_pathologic_diagnosis"
##
## $stage
## [1] "STAGE"
This function is defined by the user and tells the predictor how to create networks from the provided input data.
This function requires dataList
,groupList
, and netDir
as input variables. The residual ...
parameter is to pass additional variables to makePSN_NamedMatrix()
, notably numCores
(number of parallel jobs).
In this particular example, the custom similarity function does the following:
makePSN_NamedMatrix(writeProfiles=TRUE,...)
makePSN_NamedMatrix(writeProfiles=FALSE,simMetric="custom",customFunc=normDiff)
.makeNets <- function(dataList, groupList, netDir,...) {
netList <- c() # initialize before is.null() check
# make RNA nets (NOTE: the check for is.null() is important!)
# (Pearson correlation)
if (!is.null(groupList[["BRCA_mRNAArray-20160128"]])) {
netList <- makePSN_NamedMatrix(dataList[["BRCA_mRNAArray-20160128"]],
rownames(dataList[["BRCA_mRNAArray-20160128"]]),
groupList[["BRCA_mRNAArray-20160128"]],
netDir,verbose=FALSE,
writeProfiles=TRUE,...)
}
# make clinical nets (normalized difference)
netList2 <- c()
if (!is.null(groupList[["clinical"]])) {
netList2 <- makePSN_NamedMatrix(dataList$clinical,
rownames(dataList$clinical),
groupList[["clinical"]],netDir,
simMetric="custom",customFunc=normDiff, # custom function
writeProfiles=FALSE,
sparsify=TRUE,verbose=TRUE,...)
}
netList <- c(unlist(netList),unlist(netList2))
return(netList)
}
Note: dataList
and groupList
are generic containers that can contain whatever object the user requires to create PSN. The custom function gives the user complete flexibility in feature design.
Finally we call the function that runs the netDx predictor. We provide:
numSplits
,featScoreMax
, set to 10)featSelCutoff
); only features scoring this value or higher will be used to classify test patients, anddataList
), how variables are to be grouped into networks (groupList
) and the custom function to generate features (makeNetFunc
).Change numCores
to match the number of cores available on your machine for
parallel processing.
The call below runs 2 train/test splits. Within each split, it:
featScoreMax=2
)featSelCutoff
) to classify test samples for that split.These are unrealistically low values set so the example will run fast. In practice a good starting point is featScoreMax=10
, featSelCutoff=9
and numSplits=100
, but these parameters depend on the sample sizes in the dataset and heterogeneity of the samples.
set.seed(42) # make results reproducible
outDir <- paste(tempdir(),randAlphanumString(),
"pred_output",sep=getFileSep())
# set keepAllData=TRUE to not delete at the end of the predictor run.
# This can be useful for debugging.
out <- buildPredictor(
dataList=brca,groupList=groupList,
makeNetFunc=makeNets,
outDir=outDir, ## netDx requires absolute path
numSplits=2L,featScoreMax=2L,
featSelCutoff=1L,
numCores=1L,debugMode=TRUE,
logging="none")
## Predictor started at:
## 2021-08-19 07:53:15
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/tmp/test.log
## Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.3 s
## Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.2 s
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/LumA/LumA_query --netdx-flag true
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng1/notLumA/notLumA_query --netdx-flag true
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/tmp/test.log
## Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.5 s
## Scoring features
## Java 11 or later detected
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results/CV_1.query /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/GM_results/CV_2.query --netdx-flag true
## QueryRunner time taken: 1.3 s
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/LumA/LumA_query --netdx-flag true
## Pearson similarity chosen - enforcing min. 5 patients per net.
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.1.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.1.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.2.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.2.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## Making Java call
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.core.evaluation.ProfileToNetworkDriver -proftype continuous -cor PEARSON -threshold off -maxmissing 100.0 -in /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/profiles/1.3.profile -out /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS/1.3.txt -syn /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/1.synonyms -keepAllTies -limitTies
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.mediator.lucene.exporter.Generic2LuceneExporter /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/db.cfg /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/colours.txt
## java -Xmx10G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.engine.apps.CacheBuilder -cachedir cache -indexDir . -networkDir /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/INTERACTIONS -log /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/tmp/test.log
## java -Xmx4G -cp ~/.cache/netDx/3aeb7035c2b175_genemania-netdx.jar org.genemania.plugin.apps.QueryRunner --data /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/dataset --in flat --out flat --threads 1 --results /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA /tmp/RtmpfpXO2u/QEAYJ1252R/pred_output/rng2/notLumA/notLumA_query --netdx-flag true
## Predictor completed at:
## 2021-08-19 07:58:44
The results are stored in the list object returned by the buildPredictor()
call.
This list contains:
inputNets
: all input networks that the model started with.Split<i>
: a list with results for each train-test split
predictions
: real and predicted labels for test patientsaccuracy
: percent accuracy of predictionsfeatureScores
: feature scores for each label (list with g
entries, where g
is number of patient labels). Each entry contains the feature selection scores for the corresponding label.featureSelected
: vector of features that pass feature selection. List of length g
, with one entry per label.summary(out)
## Length Class Mode
## inputNets 10 -none- character
## Split1 4 -none- list
## Split2 4 -none- list
summary(out$Split1)
## Length Class Mode
## featureScores 2 -none- list
## featureSelected 2 -none- list
## predictions 2692 data.frame list
## accuracy 1 -none- numeric
This code collects different components of model output to examine the results.
numSplits <- 2
st <- unique(colData(brca)$STATUS)
acc <- c() # accuracy
predList <- list() # prediction tables
featScores <- list() # feature scores per class
for (cur in unique(st)) featScores[[cur]] <- list()
for (k in 1:numSplits) {
pred <- out[[sprintf("Split%i",k)]][["predictions"]];
# predictions table
tmp <- pred[,c("ID","STATUS","TT_STATUS","PRED_CLASS",
sprintf("%s_SCORE",st))]
predList[[k]] <- tmp
# accuracy
acc <- c(acc, sum(tmp$PRED==tmp$STATUS)/nrow(tmp))
# feature scores
for (cur in unique(st)) {
tmp <- out[[sprintf("Split%i",k)]][["featureScores"]][[cur]]
colnames(tmp) <- c("PATHWAY_NAME","SCORE")
featScores[[cur]][[sprintf("Split%i",k)]] <- tmp
}
}
After compiling the data above, plot accuracy for each train/test split:
print(acc)
## [1] 0.8208955 0.8208955
Create a ROC curve, a precision-recall curve, and plot average AUROC and AUPR:
predPerf <- plotPerf(predList, predClasses=st)
Use getNetConsensus()
to convert the list data structure into a single table, one per patient label. The rows show train/test splits and the columns show features that consistently perform well.
We then use callFeatSel()
to identify features that consistently perform well across the various train/test splits. Because this is a toy example, we set the bar very low to get some features. Here we accept a feature if it scores 1 or higher (fsCutoff=1
) in even one split (fsPctPass=0.05
), setting the latter to a low positive fraction.
featScores2 <- lapply(featScores, getNetConsensus)
summary(featScores2)
## Length Class Mode
## LumA 3 data.frame list
## notLumA 3 data.frame list
head(featScores2[["LumA"]])
## PATHWAY_NAME
## 1 CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I.profile
## 2 SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile
## 3 UREA_CYCLE.profile
## 4 age_cont.txt
## 5 stage_cont.txt
## Split1 Split2
## 1 2 2
## 2 2 2
## 3 2 2
## 4 1 1
## 5 NA 1
In practice, a recommended setting is fsCutoff=9
and fsPctPass=0.7
to get features that score at least 9 (out of 10) in at least 70% of the train/test splits.
featSelNet <- lapply(featScores2, function(x) {
callFeatSel(x, fsCutoff=1, fsPctPass=0)
})
print(head(featScores2[["LumA"]]))
## PATHWAY_NAME
## 1 CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I.profile
## 2 SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile
## 3 UREA_CYCLE.profile
## 4 age_cont.txt
## 5 stage_cont.txt
## Split1 Split2
## 1 2 2
## 2 2 2
## 3 2 2
## 4 1 1
## 5 NA 1
An EnrichmentMap is a network-based visualization of pathway connectivity and is used in netDx to visualize themes in predictive pathway-based features. It is used in conjunction with AutoAnnotate to identify clusters, and apply auto-generated labels to these. For more information, see the EnrichmentMap website at baderlab.org.
Use getEMapInput_many()
to create the input that helps generate the EnrichmentMap in Cytoscape.
Emap_res <- getEMapInput_many(featScores2,pathList,
minScore=1,maxScore=2,pctPass=0,out$inputNets,verbose=FALSE)
Write the results to files that Cytoscape can read in:
gmtFiles <- list()
nodeAttrFiles <- list()
for (g in names(Emap_res)) {
outFile <- paste(outDir,sprintf("%s_nodeAttrs.txt",g),sep=getFileSep())
write.table(Emap_res[[g]][["nodeAttrs"]],file=outFile,
sep="\t",col=TRUE,row=FALSE,quote=FALSE)
nodeAttrFiles[[g]] <- outFile
outFile <- paste(outDir,sprintf("%s.gmt",g),sep=getFileSep())
conn <- suppressWarnings(
suppressMessages(base::file(outFile,"w")))
tmp <- Emap_res[[g]][["featureSets"]]
gmtFiles[[g]] <- outFile
for (cur in names(tmp)) {
curr <- sprintf("%s\t%s\t%s", cur,cur,
paste(tmp[[cur]],collapse="\t"))
writeLines(curr,con=conn)
}
close(conn)
}
Finally, plot the EnrichmentMap. This step requires Cytoscape to be installed, along with the EnrichmentMap and AutoAnnotate apps. It also requires the Cytoscape application to be open and running on the machine running the code. This block is commented out for automatic builds on BioConductor, but a screenshot of the intended result is shown below.
###plotEmap(gmtFiles[[1]],nodeAttrFiles[[1]],
### groupClusters=TRUE, hideNodeLabels=TRUE)
This example EnrichmentMap isn’t terribly exciting because of the low number of pathway features permitted, the upper bound on feature selection scores and low number of train/test splits. But hopefully it serves its purpose to be illustrative.
Here is an example of an EnrichmentMap generated by running the above predictor with more real-world parameter values, and all available pathways:
We can apply a threshold to define predictive features, and integrate these into a single patient similarity network. Such a network is useful for downstream operations such as ascertaining whether or not classes are significantly separated and visualization.
Here we define predictive features as those scoring 3 out of 3 in all train/test splits.
featScores2 <- lapply(featScores, getNetConsensus)
featSelNet <- lapply(featScores2, function(x) {
callFeatSel(x, fsCutoff=2, fsPctPass=1)
})
We can examine the features:
print(featSelNet)
## $LumA
## [1] "CDP-DIACYLGLYCEROL_BIOSYNTHESIS_I.profile"
## [2] "SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile"
## [3] "UREA_CYCLE.profile"
##
## $notLumA
## [1] "SUPERPATHWAY_OF_D-_I_MYO__I_-INOSITOL__1.4.5_-TRISPHOSPHATE_METABOLISM.profile"
## [2] "UREA_CYCLE.profile"
Create a new groupList
limited to top features:
topPath <- gsub(".profile","",
unique(unlist(featSelNet)))
topPath <- gsub("_cont.txt","",topPath)
# create groupList limited to top features
g2 <- list();
for (nm in names(groupList)) {
cur <- groupList[[nm]]
idx <- which(names(cur) %in% topPath)
message(sprintf("%s: %i pathways", nm, length(idx)))
if (length(idx)>0) g2[[nm]] <- cur[idx]
}
## BRCA_mRNAArray-20160128: 3 pathways
## clinical: 0 pathways
Plot the integrated patient network based on the features selected above. Note that at this stage, the similarity measure is inverted into a dissimilarity measure so that nodes with greater similarity are closer (have smaller distance or dissimilarity) in the final network.
In the example below, the networks are integrated by taking the mean of the edge weights (aggFun="MEAN"
). For the plotting we retain only the top 5% of the strongest edges (topX=0.05
).
By setting calcShortestPath=TRUE
, the function will also compute the pairwise shortest path for within- and across-group nodes. The result is shown as a set of violin plots and a one-sided Wilcoxon-Mann-Whitney test is used to assign significance.
As with plotEMap()
, this method must be run on a computer with Cytoscape installed and running. For the purposes of this example, plotCytoscape
is set to FALSE
and a screenshot of the resulting network is provided below. To plot in Cytoscape, set plotCytoscape=TRUE
.
psn <- suppressMessages(
plotIntegratedPatientNetwork(brca,
groupList=g2, makeNetFunc=makeNets,
aggFun="MEAN",prune_pctX=0.30,prune_useTop=TRUE,
numCores=1L,calcShortestPath=TRUE,
showStats=FALSE,
verbose=FALSE, plotCytoscape=FALSE)
)
## Warning in dir.create(paste(netDir, "profiles", sep = fsep)): '/tmp/RtmpfpXO2u/
## profiles' already exists
The integrated PSN can also be visualized as a tSNE plot:
tsne <- plot_tSNE(psn$patientSimNetwork_unpruned,colData(brca))
## * Making symmetric matrix
## * Running tSNE
## * Plotting
summary(tsne)
## Length Class Mode
## N 1 -none- numeric
## Y 662 -none- numeric
## costs 331 -none- numeric
## itercosts 20 -none- numeric
## origD 1 -none- numeric
## perplexity 1 -none- numeric
## theta 1 -none- numeric
## max_iter 1 -none- numeric
## stop_lying_iter 1 -none- numeric
## mom_switch_iter 1 -none- numeric
## momentum 1 -none- numeric
## final_momentum 1 -none- numeric
## eta 1 -none- numeric
## exaggeration_factor 1 -none- numeric
class(tsne)
## [1] "list"
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.13-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.13-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=C LC_NUMERIC=C
## [3] LC_TIME=C LC_COLLATE=C
## [5] LC_MONETARY=C LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] curatedTCGAData_1.14.0 MultiAssayExperiment_1.18.0
## [3] SummarizedExperiment_1.22.0 GenomicRanges_1.44.0
## [5] GenomeInfoDb_1.28.1 IRanges_2.26.0
## [7] S4Vectors_0.30.0 MatrixGenerics_1.4.2
## [9] matrixStats_0.60.0 netDx_1.4.3
## [11] bigmemory_4.5.36 Biobase_2.52.0
## [13] BiocGenerics_0.38.0 BiocStyle_2.20.2
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.2 R.utils_2.10.1
## [3] tidyselect_1.1.1 RSQLite_2.2.7
## [5] AnnotationDbi_1.54.1 grid_4.1.0
## [7] combinat_0.0-8 BiocParallel_1.26.1
## [9] Rtsne_0.15 RNeXML_2.4.5
## [11] munsell_0.5.0 ScaledMatrix_1.0.0
## [13] base64url_1.4 codetools_0.2-18
## [15] pbdZMQ_0.3-5 withr_2.4.2
## [17] colorspace_2.0-2 filelock_1.0.2
## [19] highr_0.9 knitr_1.33
## [21] dplR_1.7.2 uuid_0.1-4
## [23] zinbwave_1.14.1 SingleCellExperiment_1.14.1
## [25] ROCR_1.0-11 NMF_0.23.0
## [27] labeling_0.4.2 repr_1.1.3
## [29] GenomeInfoDbData_1.2.6 farver_2.1.0
## [31] bit64_4.0.5 rhdf5_2.36.0
## [33] vctrs_0.3.8 generics_0.1.0
## [35] xfun_0.25 BiocFileCache_2.0.0
## [37] R6_2.5.0 doParallel_1.0.16
## [39] ggbeeswarm_0.6.0 netSmooth_1.12.0
## [41] rsvd_1.0.5 RJSONIO_1.3-1.5
## [43] locfit_1.5-9.4 bitops_1.0-7
## [45] rhdf5filters_1.4.0 cachem_1.0.5
## [47] DelayedArray_0.18.0 assertthat_0.2.1
## [49] promises_1.2.0.1 scales_1.1.1
## [51] beeswarm_0.4.0 gtable_0.3.0
## [53] phylobase_0.8.10 beachmat_2.8.1
## [55] rlang_0.4.11 genefilter_1.74.0
## [57] splines_4.1.0 lazyeval_0.2.2
## [59] BiocManager_1.30.16 yaml_2.2.1
## [61] reshape2_1.4.4 backports_1.2.1
## [63] httpuv_1.6.2 tools_4.1.0
## [65] bookdown_0.23 gridBase_0.4-7
## [67] ggplot2_3.3.5 ellipsis_0.3.2
## [69] jquerylib_0.1.4 RColorBrewer_1.1-2
## [71] Rcpp_1.0.7 plyr_1.8.6
## [73] base64enc_0.1-3 sparseMatrixStats_1.4.2
## [75] progress_1.2.2 zlibbioc_1.38.0
## [77] purrr_0.3.4 RCurl_1.98-1.4
## [79] prettyunits_1.1.1 viridis_0.6.1
## [81] cluster_2.1.2 magrittr_2.0.1
## [83] magick_2.7.3 data.table_1.14.0
## [85] mime_0.11 hms_1.1.0
## [87] evaluate_0.14 xtable_1.8-4
## [89] XML_3.99-0.7 gridExtra_2.3
## [91] shape_1.4.6 compiler_4.1.0
## [93] scater_1.20.1 tibble_3.1.3
## [95] RCy3_2.12.4 crayon_1.4.1
## [97] R.oo_1.24.0 htmltools_0.5.1.1
## [99] entropy_1.3.0 later_1.3.0
## [101] tidyr_1.1.3 howmany_0.3-1
## [103] DBI_1.1.1 ExperimentHub_2.0.0
## [105] dbplyr_2.1.1 MASS_7.3-54
## [107] rappdirs_0.3.3 Matrix_1.3-4
## [109] ade4_1.7-17 uchardet_1.1.0
## [111] R.methodsS3_1.8.1 igraph_1.2.6
## [113] pkgconfig_2.0.3 bigmemory.sri_0.1.3
## [115] rncl_0.8.4 registry_0.5-1
## [117] locfdr_1.1-8 signal_0.7-7
## [119] IRdisplay_1.0 scuttle_1.2.1
## [121] xml2_1.3.2 foreach_1.5.1
## [123] annotate_1.70.0 vipor_0.4.5
## [125] bslib_0.2.5.1 rngtools_1.5
## [127] pkgmaker_0.32.2 XVector_0.32.0
## [129] stringr_1.4.0 digest_0.6.27
## [131] pracma_2.3.3 graph_1.70.0
## [133] softImpute_1.4-1 Biostrings_2.60.2
## [135] rmarkdown_2.10 edgeR_3.34.0
## [137] DelayedMatrixStats_1.14.2 curl_4.3.2
## [139] kernlab_0.9-29 shiny_1.6.0
## [141] lifecycle_1.0.0 nlme_3.1-152
## [143] jsonlite_1.7.2 clusterExperiment_2.12.0
## [145] Rhdf5lib_1.14.2 BiocNeighbors_1.10.0
## [147] viridisLite_0.4.0 limma_3.48.3
## [149] fansi_0.5.0 pillar_1.6.2
## [151] lattice_0.20-44 KEGGREST_1.32.0
## [153] fastmap_1.1.0 httr_1.4.2
## [155] survival_3.2-12 interactiveDisplayBase_1.30.0
## [157] glue_1.4.2 png_0.1-7
## [159] iterators_1.0.13 BiocVersion_3.13.1
## [161] glmnet_4.1-2 bit_4.0.4
## [163] stringi_1.7.3 sass_0.4.0
## [165] HDF5Array_1.20.0 blob_1.2.2
## [167] AnnotationHub_3.0.1 BiocSingular_1.8.1
## [169] memoise_2.0.0 IRkernel_1.2
## [171] dplyr_1.0.7 irlba_2.3.3
## [173] ape_5.5