!!!Caution work in progress!!!
Function optimizes Extraction windows so we have the same number of precursor per window. To do it uses spectral library or nonredundant blib.
specL contains a function specL::cdsw
.
## Loading required package: DBI
## Loading required package: protViz
## Loading required package: RSQLite
## Loading required package: seqinr
quantile
# moves the windows start and end to regions where no peaks are observed
.makenewfromto <- function( windfrom, empty , isfrom=TRUE){
newfrom <- NULL
for(from in windfrom){
idx <- which.min(abs(from - empty))
startmass <- 0
if(isfrom){
if(empty[idx] < from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx-1]
}
}else{
if(empty[idx] > from) {
startmass <- empty[idx]
} else {
startmass <- empty[idx+1]
}
}
newfrom <- c(newfrom, round(startmass,digits=1))
}
return(newfrom)
}
.cdsw_compute_breaks <-
function(xx, nbins){
q <- quantile(xx, seq(0, 1, length = nbins + 1))
q[1] <- q[1] - 0.5
q[length(q)] <- q[length(q)] + 0.5
q <- round(q)
}
cdsw <-
function(x, massrange = c(300,1250), n = 20, overlap = 1.0, FUN, ...) {
if (class(x) == "psmSet") {
x <- unlist(lapply(x, function(x) {
x$pepmass
}))
} else if (class(x) == 'specLSet') {
x <- unlist(lapply(x@ionlibrary, function(xx) {
xx@q1
}))
}
# x should be numeric
if (class(x) != "numeric") {
warning("can not compute quantils. 'x' is not numeric.")
return (NULL)
}
x <- x[x > massrange[1] & x < massrange[2]]
q <- FUN(xx=x, nbins=n)
idx <- 1:n
from <- q[idx] - overlap * 0.5
to <- q[idx + 1] + overlap * 0.5
width <- 0.5 * (to - from)
mid <- from + width
h <- hist(x, breaks = q, ...)
data.frame(from, to, mid, width, counts = h$counts)
}
cdsw(exampledata,
freq=TRUE,
overlap = 0,
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## from to mid width counts
## 0% 301 384 342.5 41.5 586
## 5% 384 420 402.0 18.0 591
## 10% 420 449 434.5 14.5 583
## 15% 449 476 462.5 13.5 599
## 20% 476 500 488.0 12.0 565
## 25% 500 524 512.0 12.0 604
## 30% 524 547 535.5 11.5 566
## 35% 547 572 559.5 12.5 598
## 40% 572 598 585.0 13.0 591
## 45% 598 625 611.5 13.5 577
## 50% 625 651 638.0 13.0 603
## 55% 651 682 666.5 15.5 575
## 60% 682 713 697.5 15.5 594
## 65% 713 745 729.0 16.0 574
## 70% 745 783 764.0 19.0 595
## 75% 783 825 804.0 21.0 572
## 80% 825 881 853.0 28.0 592
## 85% 881 948 914.5 33.5 583
## 90% 948 1045 996.5 48.5 592
## 95% 1045 1250 1147.5 102.5 586
.cdsw_objective <- function(splits, data){
counts <- hist(data, breaks=splits,plot=FALSE)$counts
nbins<-length(splits)-1
optimumN <- length(data)/(length(splits)-1)
optimumN<-rep(optimumN,nbins)
score2 <-sqrt(sum((counts - optimumN)^2))
score1 <- sum(abs(counts - round(optimumN)))
return(list(score1=score1,score2 = score2, counts=counts, optimumN=round(optimumN)))
}
.cdsw_hardconstrain <- function(splits, minwindow = 5, maxwindow=50){
difsp<-diff(splits)
return(sum(difsp >= minwindow) == length(difsp) & sum(difsp <= maxwindow) == length(difsp))
}
.cdsw_compute_sampling_breaks <- function(xx, nbins=35, maxwindow=150, minwindow = 5, plot=TRUE){
breaks <- nbins+1
#xx <- x
#xx<-xx[xx >=310 & xx<1250]
# TODO(wew): there is something insconsitent with the nbins parameter
qqs <- quantile(xx,probs = seq(0,1,by=1/(nbins)))
plot(1:breaks, qqs, type="b" ,
sub=".cdsw_compute_sampling_breaks")
legend("topleft", legend = c(paste("maxwindow = ", maxwindow),
paste("nbins = ", breaks) ))
# equidistant spaced bins
unif <- seq(min(xx),max(xx),length=(breaks))
lines(1:breaks,unif,col=2,type="b")
if(!.cdsw_hardconstrain(unif,minwindow = 5, maxwindow)){
warning("there is no way to generate bins given minwindow " , minwindow, "maxwindow", maxwindow, " breaks" , breaks, "\n")
}else{
.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)
}
mixeddata <- xx
it_count <- 0
error <- 0
while(!.cdsw_hardconstrain(qqs,minwindow = 5, maxwindow)){
it_count <- it_count + 1
uniformdata<-runif(round(length(xx)/20), min=min(xx), max=max(xx))
mixeddata<-c(mixeddata,uniformdata)
qqs <- quantile(mixeddata,probs = seq(0,1,by=1/(nbins)))
lines(1:breaks,qqs,type="l", col="#00DD00AA")
error[it_count] <-.cdsw_objective(qqs, xx)$score1
}
lines(1:breaks,qqs,type="b", col="#FF1111AA")
plot(error, xlab="number of iteration", sub=".cdsw_compute_sampling_breaks")
qqs <- as.numeric(sort(round(qqs)))
qqs[1] <- qqs[1] - 0.5
qqs[length(qqs)] <- qqs[length(qqs)] + 0.5
round(qqs, 1)
}
op <- par(mfrow=c(2,2))
par(mfrow=c(3,1))
wind <- cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
n=35,
massrange = c(350,1250),
sub='sampling based',
main = "peptideStd", xlab='pepmass', FUN=function(...){.cdsw_compute_sampling_breaks(...,maxwindow = 50)})
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
readjustWindows <- function(wind ,ms1data, breaks=10000, maxbin = 5){
res <- hist(ms1data, breaks=breaks)
abline(v=wind$from,col=2,lty=2)
abline(v=wind$to,col=3,lty=2)
empty <- res$mids[which(res$counts < maxbin )]
newfrom <- .makenewfromto(wind$from , empty)
newto <- .makenewfromto(wind$to , empty , isfrom=FALSE )
plot(res,xlim=c(1060,1065))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
plot(res,xlim=c(520,550))
abline(v = newfrom,lwd=0.5,col="red")
abline(v = newto , lwd=0.5,col="green")
width <- (newto - newfrom) * 0.5
mid <- (newfrom + newto)*0.5
newCounts <- NULL
for(i in 1:length(newfrom))
{
newCounts <- c(newCounts,sum(ms1data >= newfrom[i] & ms1data <= newto[i]))
}
data.frame(newfrom, newto, mid, width, counts =newCounts)
}
readjustWindows(wind,exampledata)
## newfrom newto mid width counts
## 1 349.5 379.1 364.30 14.80 305
## 2 379.0 403.1 391.05 12.05 355
## 3 403.0 423.1 413.05 10.05 349
## 4 423.0 442.1 432.55 9.55 401
## 5 442.0 461.1 451.55 9.55 382
## 6 461.0 479.1 470.05 9.05 424
## 7 479.0 496.1 487.55 8.55 414
## 8 496.0 513.0 504.50 8.50 423
## 9 513.0 530.0 521.50 8.50 402
## 10 530.0 546.0 538.00 8.00 410
## 11 546.0 564.0 555.00 9.00 433
## 12 564.0 581.0 572.50 8.50 393
## 13 580.8 600.0 590.40 9.60 444
## 14 600.0 618.0 609.00 9.00 394
## 15 618.0 637.0 627.50 9.50 421
## 16 637.0 655.0 646.00 9.00 404
## 17 655.0 676.0 665.50 10.50 371
## 18 676.0 695.0 685.50 9.50 394
## 19 695.0 716.0 705.50 10.50 383
## 20 716.0 737.0 726.50 10.50 369
## 21 737.0 758.0 747.50 10.50 383
## 22 758.0 783.0 770.50 12.50 352
## 23 783.0 806.0 794.50 11.50 331
## 24 806.0 831.0 818.50 12.50 321
## 25 831.0 859.0 845.00 14.00 293
## 26 859.0 889.0 874.00 15.00 288
## 27 889.0 918.1 903.55 14.55 278
## 28 918.0 951.1 934.55 16.55 266
## 29 951.0 985.1 968.05 17.05 236
## 30 985.0 1022.1 1003.55 18.55 212
## 31 1022.0 1063.0 1042.50 20.50 207
## 32 1063.0 1106.0 1084.50 21.50 204
## 33 1106.0 1151.0 1128.50 22.50 123
## 34 1151.0 1200.0 1175.50 24.50 101
## 35 1200.0 1249.5 1224.75 24.75 71
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=35,
overlap = 0,
sub='quantile based',
main = "peptideStd", xlab='pepmass', FUN=.cdsw_compute_breaks)
## Warning in plot.histogram(r, freq = freq1, col = col, border = border,
## angle = angle, : the AREAS in the plot are wrong -- rather use 'freq =
## FALSE'
## from to mid width counts
## 0% 301 363 332.0 31.0 332
## 2.857143% 363 390 376.5 13.5 343
## 5.714286% 390 410 400.0 10.0 328
## 8.571429% 410 429 419.5 9.5 331
## 11.42857% 429 445 437.0 8.0 339
## 14.28571% 445 462 453.5 8.5 347
## 17.14286% 462 476 469.0 7.0 339
## 20% 476 489 482.5 6.5 315
## 22.85714% 489 503 496.0 7.0 333
## 25.71429% 503 517 510.0 7.0 353
## 28.57143% 517 531 524.0 7.0 338
## 31.42857% 531 544 537.5 6.5 316
## 34.28571% 544 557 550.5 6.5 337
## 37.14286% 557 572 564.5 7.5 341
## 40% 572 586 579.0 7.0 337
## 42.85714% 586 601 593.5 7.5 324
## 45.71429% 601 617 609.0 8.0 350
## 48.57143% 617 632 624.5 7.5 332
## 51.42857% 632 646 639.0 7.0 321
## 54.28571% 646 663 654.5 8.5 342
## 57.14286% 663 682 672.5 9.5 340
## 60% 682 698 690.0 8.0 336
## 62.85714% 698 716 707.0 9.0 323
## 65.71429% 716 736 726.0 10.0 350
## 68.57143% 736 754 745.0 9.0 328
## 71.42857% 754 776 765.0 11.0 335
## 74.28571% 776 800 788.0 12.0 336
## 77.14286% 800 825 812.5 12.5 327
## 80% 825 855 840.0 15.0 344
## 82.85714% 855 891 873.0 18.0 330
## 85.71429% 891 928 909.5 18.5 334
## 88.57143% 928 969 948.5 20.5 340
## 91.42857% 969 1029 999.0 30.0 337
## 94.28571% 1029 1097 1063.0 34.0 332
## 97.14286% 1097 1250 1173.5 76.5 336
op <-par(mfrow=c(4,3))
res <- lapply(c(75,150,300, 800), function(mws){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
overlap = 0,
main = paste("max window size", mws), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = mws)
})
})
op <-par(mfrow=c(4,3))
res <- lapply(c(20,25,30, 40), function(nbins){
cdsw(exampledata,
freq=TRUE,
plot=TRUE,
n=nbins,
overlap = 0,
main = paste("nr bins", nbins), xlab='pepmass',
FUN=function(...){
.cdsw_compute_sampling_breaks(...,maxwindow = 100)
})
})
Here is the output of sessionInfo()
on the system on which this document was compiled:
## R version 3.4.0 (2017-04-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.2 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.5-bioc/R/lib/libRblas.so
## LAPACK: /home/biocbuild/bbs-3.5-bioc/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] specL_1.10.0 seqinr_3.3-6 RSQLite_1.1-2 protViz_0.2.28
## [5] DBI_0.6-1 BiocStyle_2.4.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.10 digest_0.6.12 rprojroot_1.2 backports_1.0.5
## [5] magrittr_1.5 evaluate_0.10 stringi_1.1.5 rmarkdown_1.4
## [9] tools_3.4.0 ade4_1.7-6 stringr_1.2.0 yaml_2.1.14
## [13] compiler_3.4.0 memoise_1.1.0 htmltools_0.3.5 knitr_1.15.1