## <script type="text/javascript">
## document.addEventListener("DOMContentLoaded", function() {
## document.querySelector("h1").style.marginTop = "0";
## });
## </script>
## <script type="text/javascript">
## document.addEventListener("DOMContentLoaded", function() {
## var links = document.links;
## for (var i = 0, linksLength = links.length; i < linksLength; i++)
## if (links[i].hostname != window.location.hostname)
## links[i].target = '_blank';
## });
## </script>
The aim of this document is to document to mapping of proteins and the tandem mass spectrometry derived peptides they have been inferred from to genomic locations.
We will use a small object from Pbase
to illustrate how to retrieve
genome coordinates and map a protein back to genomic coordinates. In
the remainder of this vignette, we will concentrate on a spectific
protein, namely P00558.
library("Pbase")
data(p)
p
## S4 class type : Proteins
## Class version : 0.1
## Created : Wed Jul 16 02:58:33 2014
## Number of Proteins: 9
## Sequences:
## [1] A4UGR9 [2] A6H8Y1 ... [8] P04075-2 [9] P60709
## Sequence features:
## [1] DB [2] AccessionNumber ... [10] Comment [11] Filename
## Peptide features:
## [1] DB [2] AccessionNumber ... [46] spectrumFile [47] databaseFile
seqnames(p)
## [1] "A4UGR9" "A6H8Y1" "O43707" "O75369" "P00558" "P02545"
## [7] "P04075" "P04075-2" "P60709"
dat <- data.frame(i = 1:length(p),
npeps = elementLengths(pfeatures(p)),
protln = width(aa(p)))
dat
## i npeps protln
## A4UGR9 1 36 3374
## A6H8Y1 2 23 2624
## O43707 3 6 911
## O75369 4 13 2602
## P00558 5 5 417
## P02545 6 12 664
## P04075 7 21 364
## P04075-2 8 20 418
## P60709 9 1 375
sp <- seqnames(p)[5]
biomaRt
The input information consists of a UniProt identifier and a set of peptides positions along the protein. The first step is to map the protein accession number to a gene identifier (here, we will use Ensembl) and to obtain genome coordinates.
Multiple solutiona are offered to use. Below, we use the biomaRt
Bioconductor package.
library("biomaRt")
ens <- useMart("ensembl", "hsapiens_gene_ensembl")
bm <- select(ens, keys = sp,
keytype = "uniprot_swissprot_accession",
columns = c(
"ensembl_gene_id",
"ensembl_transcript_id",
"ensembl_peptide_id",
"ensembl_exon_id",
"description",
"chromosome_name",
"strand",
"exon_chrom_start",
"exon_chrom_end",
"5_utr_start",
"5_utr_end",
"3_utr_start",
"3_utr_end",
"start_position",
"end_position"))
bm
## ensembl_gene_id ensembl_transcript_id ensembl_peptide_id
## 1 ENSG00000102144 ENST00000373316 ENSP00000362413
## 2 ENSG00000102144 ENST00000373316 ENSP00000362413
## 3 ENSG00000102144 ENST00000373316 ENSP00000362413
## 4 ENSG00000102144 ENST00000373316 ENSP00000362413
## 5 ENSG00000102144 ENST00000373316 ENSP00000362413
## 6 ENSG00000102144 ENST00000373316 ENSP00000362413
## 7 ENSG00000102144 ENST00000373316 ENSP00000362413
## 8 ENSG00000102144 ENST00000373316 ENSP00000362413
## 9 ENSG00000102144 ENST00000373316 ENSP00000362413
## 10 ENSG00000102144 ENST00000373316 ENSP00000362413
## 11 ENSG00000102144 ENST00000373316 ENSP00000362413
## 12 ENSG00000269666 ENST00000597340 ENSP00000472461
## 13 ENSG00000269666 ENST00000597340 ENSP00000472461
## 14 ENSG00000269666 ENST00000597340 ENSP00000472461
## 15 ENSG00000269666 ENST00000597340 ENSP00000472461
## 16 ENSG00000269666 ENST00000597340 ENSP00000472461
## 17 ENSG00000269666 ENST00000597340 ENSP00000472461
## 18 ENSG00000269666 ENST00000597340 ENSP00000472461
## 19 ENSG00000269666 ENST00000597340 ENSP00000472461
## 20 ENSG00000269666 ENST00000597340 ENSP00000472461
## 21 ENSG00000269666 ENST00000597340 ENSP00000472461
## 22 ENSG00000269666 ENST00000597340 ENSP00000472461
## ensembl_exon_id description
## 1 ENSE00001600900 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 2 ENSE00003506377 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 3 ENSE00003502842 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 4 ENSE00003512377 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 5 ENSE00003581136 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 6 ENSE00003597777 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 7 ENSE00000672997 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 8 ENSE00000672996 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 9 ENSE00000672995 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 10 ENSE00000672994 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 11 ENSE00001948816 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 12 ENSE00003177038 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 13 ENSE00003462979 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 14 ENSE00003487079 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 15 ENSE00003549531 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 16 ENSE00003639123 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 17 ENSE00003515683 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 18 ENSE00003045537 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 19 ENSE00003016554 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 20 ENSE00003003175 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 21 ENSE00003138826 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## 22 ENSE00003081515 phosphoglycerate kinase 1 [Source:HGNC Symbol;Acc:8896]
## chromosome_name strand exon_chrom_start exon_chrom_end 5_utr_start
## 1 X 1 77359671 77359902 77359671
## 2 X 1 77365364 77365414 NA
## 3 X 1 77369241 77369396 NA
## 4 X 1 77369513 77369657 NA
## 5 X 1 77372809 77372912 NA
## 6 X 1 77373548 77373667 NA
## 7 X 1 77378332 77378446 NA
## 8 X 1 77378692 77378871 NA
## 9 X 1 77380371 77380548 NA
## 10 X 1 77380824 77380922 NA
## 11 X 1 77381287 77384793 NA
## 12 HG1426_PATCH 1 77365128 77365359 77365128
## 13 HG1426_PATCH 1 77370821 77370871 NA
## 14 HG1426_PATCH 1 77374698 77374853 NA
## 15 HG1426_PATCH 1 77374970 77375114 NA
## 16 HG1426_PATCH 1 77378266 77378369 NA
## 17 HG1426_PATCH 1 77379005 77379124 NA
## 18 HG1426_PATCH 1 77383789 77383903 NA
## 19 HG1426_PATCH 1 77384149 77384328 NA
## 20 HG1426_PATCH 1 77385828 77386005 NA
## 21 HG1426_PATCH 1 77386281 77386379 NA
## 22 HG1426_PATCH 1 77386744 77390250 NA
## 5_utr_end 3_utr_start 3_utr_end start_position end_position
## 1 77359837 NA NA 77320685 77384793
## 2 NA NA NA 77320685 77384793
## 3 NA NA NA 77320685 77384793
## 4 NA NA NA 77320685 77384793
## 5 NA NA NA 77320685 77384793
## 6 NA NA NA 77320685 77384793
## 7 NA NA NA 77320685 77384793
## 8 NA NA NA 77320685 77384793
## 9 NA NA NA 77320685 77384793
## 10 NA NA NA 77320685 77384793
## 11 NA 77381328 77384793 77320685 77384793
## 12 77365294 NA NA 77326142 77390250
## 13 NA NA NA 77326142 77390250
## 14 NA NA NA 77326142 77390250
## 15 NA NA NA 77326142 77390250
## 16 NA NA NA 77326142 77390250
## 17 NA NA NA 77326142 77390250
## 18 NA NA NA 77326142 77390250
## 19 NA NA NA 77326142 77390250
## 20 NA NA NA 77326142 77390250
## 21 NA NA NA 77326142 77390250
## 22 NA 77386785 77390250 77326142 77390250
We will only consider the gene located on the X chromosome and ignore the sequence patch and conveniently store the transcript and protein identifiers for later use.
bm <- bm[bm$chromosome_name == "X", ]
tr <- unique(bm$ensembl_transcript_id)
pr <- unique(bm$ensembl_peptide_id)
The information retrieved gives us various information, in particular the Ensembl gene identifer ENSG00000102144 and its starting and ending position 77320685 and 77384793 on chromosome X.
Gviz
While the above defines all necessary genomic coordinates, and as we
will make use of the Gviz
plotting infrastructure, it is more
straightforward to fetch the above information via biomaRt
using
specialised Gviz
infrastructure.
library("Gviz")
options(ucscChromosomeNames=FALSE)
bmTrack <- BiomartGeneRegionTrack(start = min(bm$start_position),
end = max(bm$end_position),
chromosome = "chrX",
genome = "hg19")
## see also the filters param and getBM to
## filter on biotype == "protein_coding"
bmTracks <- split(bmTrack, transcript(bmTrack))
grTrack <- bmTracks[[which(names(bmTracks) == tr)]]
names(grTrack) <- tr
We can convince ourselves that the manual retrieved data in bm
and
the data in the BiomartRegionTrack
subsequently subset into a
GeneRegionTrack
match.
as(grTrack, "data.frame")
## X.seqnames X.start X.end X.width X.strand X.feature
## 1 X 77359671 77359837 167 + utr5
## 2 X 77359838 77359902 65 + protein_coding
## 3 X 77365364 77365414 51 + protein_coding
## 4 X 77369241 77369396 156 + protein_coding
## 5 X 77369513 77369657 145 + protein_coding
## 6 X 77372809 77372912 104 + protein_coding
## 7 X 77373548 77373667 120 + protein_coding
## 8 X 77378332 77378446 115 + protein_coding
## 9 X 77378692 77378871 180 + protein_coding
## 10 X 77380371 77380548 178 + protein_coding
## 11 X 77380824 77380922 99 + protein_coding
## 12 X 77381287 77381327 41 + protein_coding
## 13 X 77381328 77384793 3466 + utr3
## X.gene X.exon X.transcript X.symbol X.rank X.phase
## 1 ENSG00000102144 ENSE00001600900 ENST00000373316 PGK1 1 -1
## 2 ENSG00000102144 ENSE00001600900 ENST00000373316 PGK1 1 -1
## 3 ENSG00000102144 ENSE00003506377 ENST00000373316 PGK1 2 2
## 4 ENSG00000102144 ENSE00003502842 ENST00000373316 PGK1 3 2
## 5 ENSG00000102144 ENSE00003512377 ENST00000373316 PGK1 4 2
## 6 ENSG00000102144 ENSE00003581136 ENST00000373316 PGK1 5 0
## 7 ENSG00000102144 ENSE00003597777 ENST00000373316 PGK1 6 2
## 8 ENSG00000102144 ENSE00000672997 ENST00000373316 PGK1 7 2
## 9 ENSG00000102144 ENSE00000672996 ENST00000373316 PGK1 8 0
## 10 ENSG00000102144 ENSE00000672995 ENST00000373316 PGK1 9 0
## 11 ENSG00000102144 ENSE00000672994 ENST00000373316 PGK1 10 1
## 12 ENSG00000102144 ENSE00001948816 ENST00000373316 PGK1 11 0
## 13 ENSG00000102144 ENSE00001948816 ENST00000373316 PGK1 11 -1
## feature gene exon transcript symbol
## 1 utr5 ENSG00000102144 ENSE00001600900 ENST00000373316 PGK1
## 2 protein_coding ENSG00000102144 ENSE00001600900 ENST00000373316 PGK1
## 3 protein_coding ENSG00000102144 ENSE00003506377 ENST00000373316 PGK1
## 4 protein_coding ENSG00000102144 ENSE00003502842 ENST00000373316 PGK1
## 5 protein_coding ENSG00000102144 ENSE00003512377 ENST00000373316 PGK1
## 6 protein_coding ENSG00000102144 ENSE00003581136 ENST00000373316 PGK1
## 7 protein_coding ENSG00000102144 ENSE00003597777 ENST00000373316 PGK1
## 8 protein_coding ENSG00000102144 ENSE00000672997 ENST00000373316 PGK1
## 9 protein_coding ENSG00000102144 ENSE00000672996 ENST00000373316 PGK1
## 10 protein_coding ENSG00000102144 ENSE00000672995 ENST00000373316 PGK1
## 11 protein_coding ENSG00000102144 ENSE00000672994 ENST00000373316 PGK1
## 12 protein_coding ENSG00000102144 ENSE00001948816 ENST00000373316 PGK1
## 13 utr3 ENSG00000102144 ENSE00001948816 ENST00000373316 PGK1
## rank phase
## 1 1 -1
## 2 1 -1
## 3 2 2
## 4 3 2
## 5 4 2
## 6 5 0
## 7 6 2
## 8 7 2
## 9 8 0
## 10 9 0
## 11 10 1
## 12 11 0
## 13 11 -1
Below, we create additional ideogram and axis tracks and produce a visualisation of our genomic coordinates of interest.
ideoTrack <- IdeogramTrack(genome = "hg19",
chromosome = "chrX")
axisTrack <- GenomeAxisTrack()
seqlevels(ranges(grTrack)) <-
chromosome(grTrack) <-
chromosome(ideoTrack)
plotTracks(list(ideoTrack, axisTrack, grTrack),
add53 = TRUE, add35 = TRUE)
TranscriptDb
instancesFinally, on could also use TranscriptDb
objects to create the
GeneRegionTrack
. Below, we use the UCSC annotation (which is
directly available from Bioconductor; Ensembl TranscriptDb instances
could be generated manually - TODO document this) to extract our
regions of interest.
library("TxDb.Hsapiens.UCSC.hg19.knownGene")
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txTr <- GeneRegionTrack(txdb, chromosome = "chrX",
start = bm$start_position[1],
end = bm$end_position[1])
head(as(txTr, "data.frame"))
## X.seqnames X.start X.end X.width X.strand X.feature X.id
## 1 chrX 77359666 77359837 172 + utr5 unknown
## 2 chrX 77359838 77359902 65 + CDS unknown
## 3 chrX 77361859 77362168 310 + utr5 unknown
## 4 chrX 77365364 77365382 19 + utr5 unknown
## 5 chrX 77365364 77365414 51 + CDS unknown
## 6 chrX 77365383 77365414 32 + CDS unknown
## X.exon X.transcript X.gene X.symbol X.density feature id
## 1 uc004ecz.4_1 uc004ecz.4 5230 uc004ecz.4 1 utr5 unknown
## 2 uc004ecz.4_1 uc004ecz.4 5230 uc004ecz.4 1 CDS unknown
## 3 uc011mqq.2_1 uc011mqq.2 5230 uc011mqq.2 1 utr5 unknown
## 4 uc011mqq.2_2 uc011mqq.2 5230 uc011mqq.2 1 utr5 unknown
## 5 uc004ecz.4_2 uc004ecz.4 5230 uc004ecz.4 1 CDS unknown
## 6 uc011mqq.2_2 uc011mqq.2 5230 uc011mqq.2 1 CDS unknown
## exon transcript gene symbol density
## 1 uc004ecz.4_1 uc004ecz.4 5230 uc004ecz.4 1
## 2 uc004ecz.4_1 uc004ecz.4 5230 uc004ecz.4 1
## 3 uc011mqq.2_1 uc011mqq.2 5230 uc011mqq.2 1
## 4 uc011mqq.2_2 uc011mqq.2 5230 uc011mqq.2 1
## 5 uc004ecz.4_2 uc004ecz.4 5230 uc004ecz.4 1
## 6 uc011mqq.2_2 uc011mqq.2 5230 uc011mqq.2 1
The peptides that have been experimentally observed are available as ranges (coordinates) along the protein sequences. For example, below, we see 5 peptides (ELNYFAKALESPER, DLMSKAEK, QIVWNGPVGVFEWEAFAR, FHVEEEGKGKDASGNK, GTKALMDEVVK) have been identified for our protein of interest P00558.
pp <- p[sp]
pranges(pp)
## IRangesList of length 1
## $P00558
## IRanges of length 5
## start end width names
## [1] 193 206 14 P00558
## [2] 268 275 8 P00558
## [3] 333 350 18 P00558
## [4] 124 139 16 P00558
## [5] 351 361 11 P00558
plot(pp)
The aim of this document is to document the mapping of peptides, i.e. ranges along a protein sequence to ranges along the genome reference. In other words, our aim is the convert protein coordinates to genome coordinates.
Additional features of interest (in our case peptides) can be added to the genomic visualisation using dedicated annotation tracks, that can be constructed as shown below.
The first check that we want to implement is to verify that we can regenerate the protein amino acid sequence from the genome regions that we have extracted. We start by subsetting only the actual protein coding regions, i.e ignoring the 5' and 3' untranslated regions of our Genome Region track.
(prng <- ranges(grTrack[feature(grTrack) == "protein_coding",]))
## GRanges object with 11 ranges and 7 metadata columns:
## seqnames ranges strand | feature
## <Rle> <IRanges> <Rle> | <character>
## [1] chrX [77359838, 77359902] + | protein_coding
## [2] chrX [77365364, 77365414] + | protein_coding
## [3] chrX [77369241, 77369396] + | protein_coding
## [4] chrX [77369513, 77369657] + | protein_coding
## [5] chrX [77372809, 77372912] + | protein_coding
## ... ... ... ... ... ...
## [7] chrX [77378332, 77378446] + | protein_coding
## [8] chrX [77378692, 77378871] + | protein_coding
## [9] chrX [77380371, 77380548] + | protein_coding
## [10] chrX [77380824, 77380922] + | protein_coding
## [11] chrX [77381287, 77381327] + | protein_coding
## gene exon transcript symbol
## <character> <character> <character> <character>
## [1] ENSG00000102144 ENSE00001600900 ENST00000373316 PGK1
## [2] ENSG00000102144 ENSE00003506377 ENST00000373316 PGK1
## [3] ENSG00000102144 ENSE00003502842 ENST00000373316 PGK1
## [4] ENSG00000102144 ENSE00003512377 ENST00000373316 PGK1
## [5] ENSG00000102144 ENSE00003581136 ENST00000373316 PGK1
## ... ... ... ... ...
## [7] ENSG00000102144 ENSE00000672997 ENST00000373316 PGK1
## [8] ENSG00000102144 ENSE00000672996 ENST00000373316 PGK1
## [9] ENSG00000102144 ENSE00000672995 ENST00000373316 PGK1
## [10] ENSG00000102144 ENSE00000672994 ENST00000373316 PGK1
## [11] ENSG00000102144 ENSE00001948816 ENST00000373316 PGK1
## rank phase
## <numeric> <integer>
## [1] 1 -1
## [2] 2 2
## [3] 3 2
## [4] 4 2
## [5] 5 0
## ... ... ...
## [7] 7 2
## [8] 8 0
## [9] 9 0
## [10] 10 1
## [11] 11 0
## -------
## seqinfo: 1 sequence from hg19 genome; no seqlengths
We also need the actual genome sequence (so far, we have only dealt
with regions and features). There are multiple ways to obtain genome
sequences with R and Bioconductor. As we have been working with the
Ensembl reference genome, we could simply
download
the whole genome of only the chromosome X and load it as an
AAStringSet
and extract the protein coding regions of interest at
the coordinates defined by the ranges in prng
.
library("Biostrings")
chrx <- readDNAStringSet("./Homo_sapiens.GRCh37.75.dna.chromosome.X.fa.gz")
seq <- extractAt(chrx[[1]], ranges(prng))
pptr <- translate(unlist(seq))
It is also possible to use readily package genomes to do this. Above,
we have use the TxDb.Hsapiens.UCSC.hg19.knownGene
package defining
transcripts. There is a similar package for the human UCSC genome
build, namely BSgenome.Hsapiens.UCSC.hg19
. However, as we have
focused on Ensembl data annotation, we will first need to convert our
Ensembl transcript (or protein) identifier ENST00000373316 (ENSP00000362413) into the
UCSC equivalent. (TODO Use also Homo.sapiens
annotation package.)
ucsc <- select(ens, key = pr,
keytype = "ensembl_peptide_id",
columns = "ucsc")
ucsc
## ucsc
## 1 uc004ecz.4
library("BSgenome.Hsapiens.UCSC.hg19")
prng2 <- ranges(txTr[transcript(txTr) %in% ucsc])
prng2 <- prng2[prng2$feature == "CDS"]
seq <- getSeq(BSgenome.Hsapiens.UCSC.hg19, prng2)
pptr <- translate(unlist(seq))
## remove stop codon
pptr <- pptr[1:417]
writePairwiseAlignments(pairwiseAlignment(pp[[1]], pptr))
## ########################################
## # Program: Biostrings (version 2.34.0), a Bioconductor package
## # Rundate: Mon Oct 13 21:06:35 2014
## ########################################
## #=======================================
## #
## # Aligned_sequences: 2
## # 1: P1
## # 2: S1
## # Matrix: NA
## # Gap_penalty: 14.0
## # Extend_penalty: 4.0
## #
## # Length: 417
## # Identity: 417/417 (100.0%)
## # Similarity: NA/417 (NA%)
## # Gaps: 0/417 (0.0%)
## # Score: 1794.636
## #
## #
## #=======================================
##
## P1 1 MSLSNKLTLDKLDVKGKRVVMRVDFNVPMKNNQITNNQRIKAAVPSIKFC 50
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 1 MSLSNKLTLDKLDVKGKRVVMRVDFNVPMKNNQITNNQRIKAAVPSIKFC 50
##
## P1 51 LDNGAKSVVLMSHLGRPDGVPMPDKYSLEPVAVELKSLLGKDVLFLKDCV 100
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 51 LDNGAKSVVLMSHLGRPDGVPMPDKYSLEPVAVELKSLLGKDVLFLKDCV 100
##
## P1 101 GPEVEKACANPAAGSVILLENLRFHVEEEGKGKDASGNKVKAEPAKIEAF 150
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 101 GPEVEKACANPAAGSVILLENLRFHVEEEGKGKDASGNKVKAEPAKIEAF 150
##
## P1 151 RASLSKLGDVYVNDAFGTAHRAHSSMVGVNLPQKAGGFLMKKELNYFAKA 200
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 151 RASLSKLGDVYVNDAFGTAHRAHSSMVGVNLPQKAGGFLMKKELNYFAKA 200
##
## P1 201 LESPERPFLAILGGAKVADKIQLINNMLDKVNEMIIGGGMAFTFLKVLNN 250
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 201 LESPERPFLAILGGAKVADKIQLINNMLDKVNEMIIGGGMAFTFLKVLNN 250
##
## P1 251 MEIGTSLFDEEGAKIVKDLMSKAEKNGVKITLPVDFVTADKFDENAKTGQ 300
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 251 MEIGTSLFDEEGAKIVKDLMSKAEKNGVKITLPVDFVTADKFDENAKTGQ 300
##
## P1 301 ATVASGIPAGWMGLDCGPESSKKYAEAVTRAKQIVWNGPVGVFEWEAFAR 350
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 301 ATVASGIPAGWMGLDCGPESSKKYAEAVTRAKQIVWNGPVGVFEWEAFAR 350
##
## P1 351 GTKALMDEVVKATSRGCITIIGGGDTATCCAKWNTEDKVSHVSTGGGASL 400
## ||||||||||||||||||||||||||||||||||||||||||||||||||
## S1 351 GTKALMDEVVKATSRGCITIIGGGDTATCCAKWNTEDKVSHVSTGGGASL 400
##
## P1 401 ELLEGKVLPGVDALSNI 417
## |||||||||||||||||
## S1 401 ELLEGKVLPGVDALSNI 417
##
##
## #---------------------------------------
## #---------------------------------------
To calculate the genomic coordinates of peptides we
1) Calculate the contiguous exon coordinates on the cDNA sequence
prex
.
2) Calculate the peptide corrdinates on the cDNA sequence peprgn2
from the original peptide ranges along the protein sequence.
3) We calculate the indices of the exons in which all the peptides
start and end on the cDNA sequence start_ex
and end_ex
.
4) We infer the coordinates on the genome from the actual peptide
starting/ending position and exon index on the cDNA sequence, the
exons coordinates on the cDNA sequences and the matching exons
coordinates on the genome and store these in an IRanges
object.
## exons ranges along the protein
j <- cumsum(width(seq))
i <- cumsum(c(1, width(seq)))[1:length(j)]
prex <- IRanges(start = i, end = j)
peprng <- pranges(pp)[[1]]
## peptide positions on cdna
peprng2 <- IRanges(start = 1 + (start(peprng)-1) * 3,
width = width(peprng) * 3)
## find exon and position in prex
start_ex <- subjectHits(findOverlaps(start(peprng2), prex))
end_ex <- subjectHits(findOverlaps(end(peprng2), prex))
getPos <- function(p, i, prtex = prex, nclex = prng2) {
## position in cdna
## exon index
start(prng2[i]) + (p - start(prtex[i]))
}
peptides_on_genome <- IRanges(start = getPos(start(peprng2), start_ex),
end = getPos(end(peprng2), end_ex))
Based on the new peptide genomic coordinates, it is now
straightforward to create a new AnnotationTrack
and add it the the
track visualisation.
pepTr <- AnnotationTrack(start = start(peptides_on_genome),
end = end(peptides_on_genome),
chr = "chrX", genome = "hg19",
strand = "*",
id = pcols(pp)[[1]]$pepseq,
name = "pfeatures",
col = "steelblue")
plotTracks(list(ideoTrack, axisTrack, grTrack, pepTr),
groupAnnotation = "id",
just.group = "below",
fontsize.group = 9,
add53 = TRUE, add35 = TRUE)
Finally, we customise the figure by adding a track with the \(MS^2\)
spectra. The raw data used to search the protein database an create
p
are available as an MSnExp
object.
data(pms)
library("ggplot2")
details <- function(identifier, ...) {
p <- plot(pms[[as.numeric(identifier)]], full=TRUE, plot=FALSE) + ggtitle("")
p <- p + theme_bw() + theme(axis.text.y = element_blank(),
axis.text.x = element_blank()) +
labs(x = NULL, y = NULL)
print(p, newpage=FALSE)
}
deTrack <- AnnotationTrack(start = start(peptides_on_genome),
end = end(peptides_on_genome),
genome = "hg19", chromosom = "chrX",
id = pcols(pp)[[1]]$acquisitionnum,
name = "MS2 spectra",
stacking = "squish", fun = details)
plotTracks(list(ideoTrack, axisTrack, deTrack, grTrack),
add53 = TRUE, add35 = TRUE)
TODO Check spectra. Describe how data tracks can be used to overlay additional information, such as quantitation data, identification scores, coverage, …
sessionInfo()
## R version 3.1.1 Patched (2014-09-25 r66681)
## Platform: x86_64-unknown-linux-gnu (64-bit)
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 grid parallel stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] TxDb.Hsapiens.UCSC.hg19.knownGene_3.0.0
## [2] GenomicFeatures_1.18.0
## [3] AnnotationDbi_1.28.0
## [4] Biobase_2.26.0
## [5] ggplot2_1.0.0
## [6] BSgenome.Hsapiens.UCSC.hg19_1.4.0
## [7] BSgenome_1.34.0
## [8] rtracklayer_1.26.0
## [9] biomaRt_2.22.0
## [10] mzID_1.4.0
## [11] Biostrings_2.34.0
## [12] XVector_0.6.0
## [13] Pbase_0.4.0
## [14] Gviz_1.10.0
## [15] GenomicRanges_1.18.0
## [16] GenomeInfoDb_1.2.0
## [17] IRanges_2.0.0
## [18] S4Vectors_0.4.0
## [19] Rcpp_0.11.3
## [20] BiocGenerics_0.12.0
## [21] BiocStyle_1.4.0
##
## loaded via a namespace (and not attached):
## [1] BBmisc_1.7 BatchJobs_1.4
## [3] BiocInstaller_1.16.0 BiocParallel_1.0.0
## [5] DBI_0.3.1 Formula_1.1-2
## [7] GenomicAlignments_1.2.0 Hmisc_3.14-5
## [9] MALDIquant_1.11 MASS_7.3-35
## [11] MSnbase_1.14.0 Pviz_1.0.0
## [13] R.methodsS3_1.6.1 RColorBrewer_1.0-5
## [15] RCurl_1.95-4.3 RSQLite_0.11.4
## [17] Rsamtools_1.18.0 VariantAnnotation_1.12.0
## [19] XML_3.98-1.1 acepack_1.3-3.3
## [21] affy_1.44.0 affyio_1.34.0
## [23] base64enc_0.1-2 biovizBase_1.14.0
## [25] bitops_1.0-6 brew_1.0-6
## [27] checkmate_1.4 chron_2.3-45
## [29] cleaver_1.4.0 cluster_1.15.3
## [31] codetools_0.2-9 colorspace_1.2-4
## [33] data.table_1.9.4 dichromat_2.0-0
## [35] digest_0.6.4 doParallel_1.0.8
## [37] evaluate_0.5.5 fail_1.2
## [39] foreach_1.4.2 foreign_0.8-61
## [41] formatR_1.0 gtable_0.1.2
## [43] impute_1.40.0 iterators_1.0.7
## [45] knitr_1.7 labeling_0.3
## [47] lattice_0.20-29 latticeExtra_0.6-26
## [49] limma_3.22.0 markdown_0.7.4
## [51] matrixStats_0.10.0 mime_0.2
## [53] munsell_0.4.2 mzR_2.0.0
## [55] nnet_7.3-8 pcaMethods_1.56.0
## [57] plyr_1.8.1 preprocessCore_1.28.0
## [59] proto_0.3-10 reshape2_1.4
## [61] rpart_4.1-8 scales_0.2.4
## [63] sendmailR_1.2-1 splines_3.1.1
## [65] stringr_0.6.2 survival_2.37-7
## [67] tools_3.1.1 vsn_3.34.0
## [69] zlibbioc_1.12.0