This file contains the full code for replicating the analysis on our manuscript.
single 1.7.0
This code reproduces the analysis made on our manuscript “Accurate gene consensus at low nanopore coverage”, from the raw fast5 files returned by the Nanopore Sequencer to final figures.
Here, we applied SINGLe to the gene of KlenTaq, a truncated variant of the well-known Taq polymerase, of approximately 1.7 kb in length. We first tested it on a small set of seven known mutants containing 2 to 9 point mutations (Small Set, or KT7), and later a larger library of approximately 1200 variants (KTLib).
Make sure that in the path were you are running this code there is:
The chunks on the general inputs section load packages, functions and variables that are needed for different chunks below. They do not use a lot of memory, so I’d recommend running them all before running the particular section you are interested in.
Also, sections are not independent between them. The clearest example is the single_train variable that it is obtained in the Single correction for the small library and then it is used for the correction of the large library long after.
You’ll need to have installed the following software:
This includes loading needed packages, defining functions and loading data that will be used in some parts of the code. It’s fast and light, so just run all chunks before going on.
Packages
require(single)
suppressPackageStartupMessages(require(plyr))
suppressPackageStartupMessages(require(dplyr))
options(dplyr.summarise.inform=F)
suppressPackageStartupMessages(require(reshape2))
suppressPackageStartupMessages(require(foreach))
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(ggbreak))
suppressPackageStartupMessages(require(grid))
suppressPackageStartupMessages(require(gridExtra))
suppressPackageStartupMessages(require(stringr))
suppressPackageStartupMessages(library(Biostrings))
suppressPackageStartupMessages(require(cowplot))
suppressPackageStartupMessages(library(e1071))
Functions
detect_homopolymer_positions <- function(sequence){
l <- length(sequence)
homopolymer_positions <- list()
aux_positions <- 1
counter <- 0
for(i in seq_along(sequence)[-l]){
if (sequence[i+1]==sequence[i]){
aux_positions <- c(aux_positions,i+1)
}else{
counter <- counter+1
homopolymer_positions[[counter]] <- aux_positions
aux_positions <- i+1
}
}
if(sequence[i+1]==sequence[i]){
counter <- counter+1
homopolymer_positions[[counter]] <- aux_positions
}
return(homopolymer_positions)
}
biostr_to_char <- function(x){
y <- strsplit(as.character(x), split="")[[1]]
return(y)
}
detect_mutations <- function(sequence,reference){
ind <- which(sequence != reference)
muts <- paste0(reference[ind],ind,sequence[ind])
return(muts)
}
replace_str_in_pos <- function(string, pos, replacement){
if(pos>nchar(string)){stop('Replacement intended outsife string')}
string_vec <- strsplit(string, split="")[[1]]
string_vec[pos] <- replacement
string_out <- paste(string_vec, collapse = "")
return(string_out)
}
bc_one_mutation <- function(x){
bases <- c("A","C","G","T")
y <- c()
n <- 0
for(i in 1:nchar(x)){
n <- n+1
non_wt <- setdiff(bases, substr(x, i,i))
non_wt <- paste0(non_wt, collapse = "")
non_wt <- paste0("[",non_wt, "]", collapse = "")
y[n] <- replace_str_in_pos(x, i, non_wt)
}
for(i in 2:(nchar(x)-1)){
n <- n+1
y[n] <- paste0(substr(x,1,(i-1)),substr(x,(i+1), nchar(x)))
}
y <- unique(y)
return(y)
}
dna_reverse_complement_vector <- function(x){
y <- rev(dna_complement_vector(x))
return(y)
}
dna_complement <- function(x){
if(length(x)>1){stop("dna_complement: x has length > 1")}
if(x=="A" | x=="a"){ y <- "T"}
if(x=="C" | x=="c"){ y <- "G"}
if(x=="G" | x=="g"){ y <- "C"}
if(x=="T" | x=="t"){ y <- "A"}
return(y)
}
dna_complement_vector <- function(x){
y <- vapply(x, dna_complement, USE.NAMES = F)
return(y)
}
dna_complement_string <- function(x){
x <- strsplit(x, split="")[[1]]
y <- dna_complement_vector(x)
y <- paste(y, collapse = "")
return(y)
}
dna_reverse_complement_string <- function(x){
x <- strsplit(x, split="")[[1]]
y <- dna_reverse_complement_vector(x)
y <- paste(y, collapse = "")
return(y)
}
Paths
path_references <- "1_ReferenceFiles/"
path_raw_data <- "2_NanoporeRawReads/"
path_save_figures <- "6_Figures/"
path_analysis_lib7 <- "3_Analysis/KT7_1700_2100/"
path_analysis_lib <- "3_Analysis/KTLib/"
if(!dir.exists(path_save_figures)){dir.create(path_save_figures)}
if(!dir.exists(path_analysis_lib7)){dir.create(path_analysis_lib7)}
if(!dir.exists(path_analysis_lib)){dir.create(path_analysis_lib)}
Colors
red <- rgb(218,0,0,maxColorValue = 255)
green <- rgb(0,122,0,maxColorValue = 255)
red_tr <- rgb(230,0,0,alpha = 200, maxColorValue = 255)
blue_tr <- rgb(0,0,218,alpha = 250, maxColorValue = 255)
green_tr <- rgb(0,122,0,alpha = 200, maxColorValue = 255)
blue <- rgb(0,0,250, maxColorValue = 255)
single_color <- "#2727fa"
medaka_color <- "#26a026"
guppy_color <- "#fa2929"
nanopolish_color <- "#fa8c28"
noweights_color <- "#8c26fa"
racon_color <- "#dcdc26"
nextpolish_color <- grey(.6)
colors_vector <- c(single_color,medaka_color,guppy_color,nanopolish_color,noweights_color, racon_color,nextpolish_color)
names(colors_vector) <- c("single","medaka","nanopore","nanopolish","no_weights", "racon","nextpolish")
colors_vector_capital <- colors_vector
names(colors_vector_capital) <- c("SINGLe","Medaka","Guppy","Nanopolish","No weights","Racon","NextPolish")
methods_capital <- setNames(c("Nanopolish","Medaka","Racon","SINGLe","Guppy","No weights","NextPolish"),
c("nanopolish","medaka","racon","single","nanopore","no_weights","nextpolish"))
Plotting themes
theme_plot <- theme_bw()+
theme(line=element_line(colour = "black",
size = 0.5, linetype = "dashed",lineend = "square"),
rect= element_rect(fill = NULL, colour = NULL,
size = 1,linetype = "solid",inherit.blank = FALSE),
text=element_text(size = 10),title = element_text(size=10),
axis.title = element_text(size=rel(1)),
axis.text.x=element_text(angle=0, size=rel(.8)),
axis.text.y=element_text(angle=0, size=rel(.8)),
plot.tag = element_text(size=rel(1), face = "bold", vjust=-1),
panel.grid = element_blank(), panel.border = element_rect(size=1),
legend.background = element_blank(),
plot.margin=margin(t=0,r=3,l=0,b=0, unit="pt"),
strip.background = element_rect(fill="white"),
legend.text = element_text(size=rel(0.8))
)
theme_set(theme_plot)
Reference sequence
reference_file <- "1_ReferenceFiles/KTrefORF_1662.fasta"
wildtypye_bstr <- readDNAStringSet(reference_file)
wildtype <- toupper(biostr_to_char(wildtypye_bstr))
wt_homopolymers <- detect_homopolymer_positions(wildtype)
wt_homopolymers <- wt_homopolymers[vapply(wt_homopolymers,length)>1]
pos_wt_homopolymers <- unlist(wt_homopolymers)
Experimental data for set of KlenTaq’s 7 variants
kt7_barcodes <- paste0("barcode",c("05","06","07","08","09","10","11"))
kt7_true_mutants <- data.frame(
Barcode = kt7_barcodes,
true_consensus = c("C867G G1120A C1214T G1316A",
"G23A C245T",
"G94C A378G A905G G1023T A1381T",
"T425A G846A C851T G1403A",
"G307A G490- T659A T675A G993A A1554G",
"A41T G331T G349A C772T C879T C1474A C1475T G1530-",
"G303A G376A C517G T857A T1056A G1550A"))
kt7_bc2mut <- c("#1 - 2sub","#2 - 4sub","#3 - 4sub","#4 - 5sub","#5 - 6sub","#6 - 7sub 1del","#7 - 5sub 1del")
names(kt7_bc2mut) <- paste0("barcode",c("06","05","08","07","11","10","09"))
mutant_sequences <- readDNAStringSet("1_ReferenceFiles/KT7_sequences_aligned.fasta") #known sequences of mutants (different from reference_sequence)
kt_true_mutations <- list()
for(i in 1:length(mutant_sequences)){
mutant_seq <- toupper(stringr::str_split(as.character(mutant_sequences[[i]]), pattern = "")[[1]])
index <- which (mutant_seq!= wildtype)
kt_true_mutations[[i]] <- data.frame(sequence_id=rep(names(mutant_sequences)[i], length(index)),
position = index, nucleotide =mutant_seq[index])
}
kt_true_mutations <- do.call(rbind,kt_true_mutations)
Others
subsets <- c(seq(3,9,by=1),seq(10,50,by=5))
n_repetitions <- 50
In a unique sequencing run we used a barcoding kit to measure several samples at the same time. The table below indicates the number of the barcode (that will be returned by guppy_barcoder) and it’s content
BarcodeID | VariantID | Mutations |
---|---|---|
BC01 | wildtype | |
BC05 | Variant 2 | C867G G1120A C1214T C1316A |
BC06 | Variant 1 | G23A C245T |
BC07 | Variant 4 | G94C A378G A905G G1023T A1381T |
BC08 | Variant 3 | T425A G846A C851T G1403A |
BC09 | Variant 7 | G307A T659A T675A G993A A1554G G490 |
BC10 | Variant 6 | A41T G331T G349A C772T A785G C879T C1474A C1475T G1530 |
BC11 | Variant 5 | G303A G376A C517G T857A T1056A G1550A |
Basecalling was done with the following command lines in bash (replace paths if needed)
## BASECALL USING GUPPY
guppy_basecaller -i 2_NanoporeRawReads/SmallSet_fast5/ -s 3_Analysis/SmallSet_SUP/ -c dna_r9.4.1_450bps_sup.cfg -x 'cuda:0' -r
#Merge all files in a unique file
cat 3_Analysis/SmallSet_SUP/pass/*fastq > 3_Analysis/KT7.fastq
# for example: cat SmallSet_SUP/pass/*fastq > SmallSet.fastq
#Filter sequences by length (1700 to 2100 bp)
mkdir 3_Analysis/KT7_1700_2100
awk 'NR % 4 ==2 && length($0) > 1700 && length($0) < 2100 {print f "\n" $0;getline;print;getline;print} {f=$0}' 3_Analysis/KT7.fastq > 3_Analysis/KT7_1700_2100/KT7_1700_2100.fastq
#Demultiplexing by Guppy
guppy_barcoder -i 3_Analysis/KT7_1700_2100/ -s 3_Analysis/KT7_1700_2100
#Merge files to one file per barcode
cat 3_Analysis/KT7_1700_2100/barcode01/*fastq >> 3_Analysis/KT7_1700_2100/barcode01.fastq
cat 3_Analysis/KT7_1700_2100/barcode05/*fastq >> 3_Analysis/KT7_1700_2100/barcode05.fastq
cat 3_Analysis/KT7_1700_2100/barcode06/*fastq >> 3_Analysis/KT7_1700_2100/barcode06.fastq
cat 3_Analysis/KT7_1700_2100/barcode07/*fastq >> 3_Analysis/KT7_1700_2100/barcode07.fastq
cat 3_Analysis/KT7_1700_2100/barcode08/*fastq >> 3_Analysis/KT7_1700_2100/barcode08.fastq
cat 3_Analysis/KT7_1700_2100/barcode09/*fastq >> 3_Analysis/KT7_1700_2100/barcode09.fastq
cat 3_Analysis/KT7_1700_2100/barcode10/*fastq >> 3_Analysis/KT7_1700_2100/barcode10.fastq
cat 3_Analysis/KT7_1700_2100/barcode11/*fastq >> 3_Analysis/KT7_1700_2100/barcode11.fastq
Align using minimap2 and create count files.
#!/bin/bash
minimap2 -ax map-ont --sam-hit-only 1_ReferenceFiles/KTrefamplicon.fasta 3_Analysis/KT7_1700_2100/barcode01.fastq > 3_Analysis/KT7_1700_2100/barcode01.sam
samtools view -S -b 3_Analysis/KT7_1700_2100/barcode01.sam > 3_Analysis/KT7_1700_2100/barcode01.bam #TRANSFORM TO BAM
samtools sort 3_Analysis/KT7_1700_2100/barcode01.bam -o 3_Analysis/KT7_1700_2100/barcode01.sorted.bam #SORT BAM FILE
for (( counter=5; counter<12; counter++ ))
do
if [ $counter -lt 10 ];
then
bc=0$counter
else
bc=$counter
fi
name=barcode$bc
minimap2 -ax map-ont --sam-hit-only 1_ReferenceFiles/KTrefamplicon.fasta 3_Analysis/KT7_1700_2100/$name.fastq >3_Analysis/KT7_1700_2100/$name.sam
samtools view -S -b 3_Analysis/KT7_1700_2100/$name.sam > 3_Analysis/KT7_1700_2100/$name.bam # TRANSFORM TO BAM
samtools sort 3_Analysis/KT7_1700_2100/$name.bam -o 3_Analysis/KT7_1700_2100/$name.sorted.bam # SORT BAM FILE
echo -n "$bc "
done
printf "\n"
pos_start <- 60
pos_end <- 1721
tic <- proc.time()
## TRAIN
#Forward
single_train <- single_train(bamfile= "3_Analysis/KT7_1700_2100/barcode01.sorted.bam",
output = "3_Analysis/KT7_1700_2100/barcode01_5d4" ,
mean.n.mutations = 5.4,verbose = T,
rates.matrix = mutation_rate,
refseq_fasta = reference_file,
pos_start = pos_start, pos_end = pos_end,
save_partial=T, save_final=T)
## EVALUATE
cl <- parallel::makeForkCluster(7)
doParallel::registerDoParallel(cl)
foreach(n = 5:11)%dopar%{
if(n<10){x=paste0("0",n)}else{x=n}
single_evaluate(bamfile = paste0("3_Analysis/KT7_1700_2100/barcode",x,".sorted.bam"),
single_fits = single_train,
output_file = paste0("3_Analysis/KT7_1700_2100/barcode",x,"_SINGLe.fastq"),
refseq_fasta= reference_file,
pos_start=pos_start,pos_end=pos_end,
verbose=T,gaps_weights="minimum",
save_original_scores = T,
save=T)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Compute error rate on wild type reads
tic <- proc.time()
reads_wt <- read.table(paste0(path_analysis_lib7,"barcode01_5d4_SINGLe_countsPNQ.txt"), header=T)
reads_wt <- reads_wt %>%
mutate(isWT= nucleotide==wildtype[pos]) %>%
dplyr::rename(Position=pos, Nucleotide=nucleotide, counts=count)
total_error_rate <- reads_wt %>% group_by(isWT) %>% dplyr::summarise(counts=sum(counts))
total_error_perc <- total_error_rate[1,2]/sum(total_error_rate$counts)*100
cat("Error rate:", as.numeric(round(total_error_perc,2)),"%")
## Qscore distribution
reads_wt_qscore_per_position <- reads_wt %>%
group_by(QUAL, isWT) %>%
dplyr::summarise(Counts=sum(counts))
## Errors per position
reads_wt_errors_by_position <- reads_wt %>%
group_by(Position, Nucleotide,isWT) %>%
dplyr::summarise(Counts=sum(counts)) %>%
mutate(n_plot = floor(Position/300))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Figure_1A <- ggplot(reads_wt_qscore_per_position,aes(x=QUAL,y=Counts/(10^5),col=isWT))+
geom_line(lwd=0.5)+
xlab(expression(Q[score]))+ylab(bquote('Counts'~(x10^5)))+
scale_color_manual(breaks=c(TRUE,FALSE), labels=c("Correct","Error"),values=c(green_tr,red_tr))+
theme_plot+
theme(legend.direction="vertical",
legend.position = c(.7,.8),
legend.spacing.y = unit(0.05,"line"),
legend.key.height = unit(0.5,"line"),
legend.key.width = unit(0.6,"line"),
legend.text=element_text(size=rel(.8)),
legend.title=element_blank(),
legend.background = element_blank(),
plot.margin = margin(t=0,r = 0,b = 0,l=0))+
labs(tag = "A")
Figure_1A
Figure_1A_norm <- ggplot(reads_wt_qscore_per_position %>% dplyr::group_by(isWT) %>% dplyr::mutate(NormCounts=Counts/sum(Counts)),aes(x=QUAL,y=NormCounts,col=isWT))+
geom_line(lwd=0.5)+
xlab(expression(Q[score]))+ylab('Normalized counts')+
scale_color_manual(breaks=c(TRUE,FALSE), labels=c("Correct","Error"),values=c(green_tr,red_tr))+
theme_plot+
theme(legend.direction="vertical",
legend.position = c(.7,.7),
legend.spacing.y = unit(0.05,"line"),
legend.key.height = unit(1,"line"),
legend.text=element_text(size=rel(1)),
legend.title=element_blank(),
legend.background = element_blank())
Figure_1A_norm
ggsave(Figure_1A_norm, filename = paste0(path_save_figures,"Figure1Asuppl.png"),width =8 ,height = 8,dpi='print',units = "cm")
Figure_1B <- ggplot(reads_wt_errors_by_position %>%
filter(!isWT & Nucleotide !="-" & Position >100 & Position < 150),
aes(x=Position,y=Counts,fill=Nucleotide))+
guides(fill=guide_legend(ncol=2))+
geom_col()+
scale_x_continuous(breaks=seq(100,150,by=25))+
theme_plot+
theme(legend.direction="horizontal",
legend.position = c(.3,.85),
legend.key.size = unit(.5,units = "line"),
legend.box = "vertical" ,
legend.background = element_blank(),
legend.text=element_text(size=rel(.8)),
legend.title=element_blank(),
plot.margin = margin(t=0,r = 0,b = 0,l=0)
)+
labs(tag = "B")
Figure_1B
Figure_S2 <- ggplot(reads_wt_errors_by_position %>% filter(!isWT & Nucleotide !="-"),
aes(x=Position, y=Counts,fill=Nucleotide))+
geom_col()+facet_wrap(~n_plot, ncol=1, scales="free_x")+
theme_plot+
theme(strip.background = element_blank(), strip.text.x = element_blank(),
legend.direction="horizontal", legend.position = "top",
rect= element_rect(fill = NULL, colour = NULL, size = 1,linetype = "solid",inherit.blank = FALSE)
)
Figure_S2
ggsave(Figure_S2, file=paste0(path_save_figures,"FigureS2.png"), dpi = 'print', width = 16, height=22,units="cm")
Choose example of fits
tic <- proc.time()
## Example of fit
data_fits <- read.table("3_Analysis/KT7_1700_2100/barcode01_5d4_SINGLe_fits.txt", header = T)
data_mut <- read.table("3_Analysis/KT7_1700_2100/barcode01_5d4_SINGLe_data.txt", header = T)
position <- 547 # Choose position to plot, 547 in manuscript
mut.base <- "A" # Choose nucleotide to plot, A in manuscript
str <- "+"
#Filter all data by position and base
data_chosen_pos_base <- data_mut %>%
filter(pos==position & nucleotide==mut.base & strand==str) %>%
filter(count>0 | counts.wt>0) %>%
mutate(ratio=counts.wt/(count+counts.wt),
ratio_scaled =counts.wt.scaled/(counts.wt.scaled+counts.scaled))
wt <- as.character(data_chosen_pos_base$wt.base[1])
#Filter all fits' data by position and base
data_fits_chosen_pos_base <- data_fits %>%
filter(pos==position & (nucleotide==mut.base | nucleotide==wt) & strand==str)
class_fit_prior <- vapply(data_fits_chosen_pos_base$priors, class)
if(any(class_fit_prior=="numeric")){
ind_out <- which(class_fit_prior=="numeric");
data_fits_chosen_pos_base <- data_fits_chosen_pos_base[-ind_out,]
}
#Construct fitting curves
quality_range <- 0:35 #* You'll draw the fitted curves for this Qscores (x-values)
curve_fitted_corrected <- data.frame(quality=quality_range,
value=glm.predict.(quality_range, slope = data_fits_chosen_pos_base$prior_slope,
intercept =data_fits_chosen_pos_base$prior_intercept))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Figure_1D <- ggplot(data_chosen_pos_base)+
geom_point(aes(x=QUAL, y=ratio_scaled), size=.5)+
geom_line(data=curve_fitted_corrected, aes(x=quality, y=value), linetype="dashed", color=single_color)+
scale_y_continuous(breaks=c(0,0.5,1), limits = c(0,1))+scale_x_continuous(limits=c(0,35))+
xlab(expression(Q[score]))+ylab(expression(N[correct]))+ggtitle("SINGLe")+
theme_plot+
theme(plot.title = element_text(size=8))+
labs(tag = "D")
Figure_1D
fit_nopriors <- stats::glm(data_chosen_pos_base$ratio~ data_chosen_pos_base$QUAL,family = "quasibinomial")
fit_nopriors_coeff <- stats::coefficients(fit_nopriors)
curve_fitted_corrected <- data.frame(quality=quality_range,
value=glm.predict.(quality_range, slope = fit_nopriors_coeff[2],
intercept =fit_nopriors_coeff[1]))
Figure_1C <- ggplot(data_chosen_pos_base, aes(x=QUAL, y=ratio))+geom_point()+
geom_line(data=curve_fitted_corrected, aes(x=quality, y=value), linetype="dashed", color=guppy_color)+
scale_y_continuous(breaks=c(0,0.5,1),limits = c(0,1))+scale_x_continuous(limits=c(0,35))+
xlab(expression(Q[score]))+ylab(expression(N[correct]))+ggtitle("Guppy")+
theme_plot+
theme(plot.title = element_text(size=8), plot.margin = margin(t=0.5, l=0.5))+
labs(tag = "C")
Figure_1C
Figure1 <- ( Figure_1A | Figure_1B ) / (Figure_1C | Figure_1D)
Figure1
ggsave(Figure1, file=paste0(path_save_figures,"Figure1.png"),
dpi = 'print', width = 8.5, height=8.5,units="cm")
tic <- proc.time()
reads_wt <- read.table(paste0(path_analysis_lib7,"barcode01_5d4_SINGLe_countsPNQ.txt"), header=T) %>%
dplyr::rename(Strand=strand) %>%
mutate(isWT = nucleotide==wildtype[pos])
# mean Qscore is similar
tot_counts_for <- sum(reads_wt %>% filter(Strand=="+") %>% select(count))
mean_qscore_forward <- reads_wt %>% filter(Strand=="+") %>% summarise( sum(count*QUAL)/tot_counts_for)
tot_counts_rev <-sum(reads_wt %>% filter(Strand=="-") %>% select(count))
mean_qscore_reverse <- reads_wt %>% filter(Strand=="-") %>% summarise( sum(count*QUAL)/tot_counts_rev)
# Qscore distributions are similar:
counts_hist <- reads_wt %>%
dplyr::group_by(QUAL,Strand) %>%
dplyr::summarise(counts=sum(count)) %>%
ungroup()%>%
mutate(Proportion = counts/sum(counts)) %>%
dplyr::rename(Qscore=QUAL) %>%
mutate(Strand =ifelse(Strand=="+", "Forward","Reverse"))
# Percentage of errors are similar
perc_errors <- reads_wt %>%
group_by(Strand, isWT) %>%
summarise(counts = sum(count)) %>%
mutate(isWT = c("yes","no")[2-as.numeric(isWT)])%>%
reshape2::dcast(formula = Strand ~ isWT) %>%
mutate(perc_error = no/(yes+no)*100)
# Percentages of errors per position are different
perc_errors_perpos <- reads_wt %>%
group_by(Strand, isWT,pos) %>%
summarise(counts = sum(count)) %>%
mutate(isWT = c("yes","no")[2-as.numeric(isWT)])%>%
reshape2::dcast(formula = Strand +pos ~ isWT) %>%
mutate(perc_error = no/(yes+no)*100)
perc_errors_perpos_cast <- reshape2::dcast(perc_errors_perpos,formula=pos~Strand)%>%
dplyr::rename(Forward="+", Reverse="-")
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
FigS11A <- ggplot(counts_hist, aes(x=Qscore, y=Proportion,linetype=Strand))+
geom_line()+theme(legend.position = c(.7,.7))
FigS11B <- ggplot(perc_errors_perpos_cast, aes(x=Forward,y=Reverse))+
geom_point(col=rgb(0,0,0,0.3))+
scale_x_log10(limits=c(00.1,100))+
scale_y_log10(limits=c(00.1,100))+
xlab("Error (%) - forward strand")+
ylab("Error (%) - reverse strand")
# Thus fits are different
all_fits <- read.table(paste0(path_analysis_lib7,"/barcode01_5d4_SINGLe_fits.txt"), header = T)
forward_fits <- all_fits %>% filter(strand=="+")
reverse_fits <- all_fits %>% filter(strand=="-")
qval <- seq(0,40,by=0.1)
par(mfrow=c(1,4), mar=c(4,4,1,1),mgp=c(2,.7,0), las=1)
first <- T
for(n in c(1278:1281)+4*7-1){
y_for <- glm.predict.(x=qval, slope = forward_fits$prior_slope[n], intercept = forward_fits$prior_intercept[n])
y_rev <- glm.predict.(x=qval, slope = reverse_fits$prior_slope[n], intercept = reverse_fits$prior_intercept[n])
name <- paste(forward_fits$pos[n], forward_fits$nucleotide[n])
if(first){
df <- rbind(data.frame(Qscore=qval,Sense="Forward",Fit=y_for,n=name),
data.frame(Qscore=qval,Sense="Reverse",Fit=y_rev,n=name))
first <- F
}else{
df <- rbind(df,
rbind(data.frame(Qscore=qval,Sense="Forward",Fit=y_for,n=name),
data.frame(Qscore=qval,Sense="Reverse",Fit=y_rev,n=name))
)}
}
plot_fits_examples <- ggplot(df, aes(x=Qscore,y=Fit,lty=Sense))+geom_line()+facet_wrap(~n,nrow=1)
FigS11up <- plot_grid(FigS11A,FigS11B+theme(plot.margin=margin(l=10,r=5.5,t=7,b=5.5)),labels=c("A","B"),hjust = 0.1,vjust=1)
FigS11C <- plot_grid(plot_fits_examples,labels="C",hjust = 0.1)
FigS11 <-
plot_grid(FigS11up, FigS11C, nrow=2,hjust = -1)
ggsave(paste0(path_save_figures,"FigS11_strandsense.png"),
width = 7.62,height = 5.68,units = "in",dpi = 300)
FigS11
For Guppy and SINGLe, compare how does the signal and noise change if we accept as truth only the nucleotides with a p_right higher than a cut off.
Computation (takes ~ 1min).
tic <- proc.time()
# Identify errors and correct reads (signal and noise)
n <- 0
for(bc_i in kt7_barcodes){
message('Proccessing ', bc_i, '\n')
n<- n+1
#Barcode info
actual_mutations <- kt_true_mutations%>% #mutations present in this barcode
filter(sequence_id==bc_i)%>% select(nucleotide, position)
actual_mutations <- actual_mutations[which(actual_mutations$nucleotide!="-"),]
#Load corrected data
seqs_single <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, bc_i,"_SINGLe.fastq"))
aux_seqs <- vapply(as.character(seqs_single),strsplit, split="")
aux_quals <- 1-as(quality(seqs_single),"NumericList")
aux_pos <- lapply(aux_seqs, seq_along)
aux_names <- c(vapply(names(seqs_single),rep, each=1662))
data_single <- data.frame(nucleotide = unlist(aux_seqs),
p_SINGLe=unlist(aux_quals),
position=unlist(aux_pos) ,
seq_id=aux_names)
rownames(data_single)<-NULL
#Load original data
bf <- Rsamtools::BamFile(paste0(path_analysis_lib7, bc_i,".bam"))
reads <- Rsamtools::scanBam(bf)
#keep sequences that start at least at pos_start
index <- which(reads[[1]]$pos<=pos_start)
reads[[1]] <- vapply(reads[[1]], function(x){x[index]})
seqs_guppy <- GenomicAlignments::sequenceLayer(reads[[1]]$seq,
reads[[1]]$cigar,to = "reference")
names(seqs_guppy) <- reads[[1]]$qname
scores_guppy <- GenomicAlignments::sequenceLayer(reads[[1]]$qual,
reads[[1]]$cigar,to = "reference")
names(scores_guppy) <- reads[[1]]$qname
#Fill with gaps at the end of sequences shorter than pos_end
index_short_sequences <- which(width(seqs_guppy)<pos_end)
for(i in index_short_sequences){
seqs_guppy[i] <- paste0(as.character(seqs_guppy[i]),
paste0(rep("-", pos_end-width(seqs_guppy[i])),
collapse = ""),
collapse = "")
scores_guppy[i] <- paste0(as.character(scores_guppy[i]),
paste0(rep("-", pos_end-width(scores_guppy[i])),
collapse = ""),
collapse = "")
}
seqs_guppy <- subseq(seqs_guppy, start=pos_start+1-reads[[1]]$pos,end=pos_end+1-reads[[1]]$pos)
scores_guppy <- subseq(scores_guppy, start=pos_start+1-reads[[1]]$pos,end=pos_end+1-reads[[1]]$pos)
aux_seqs <- vapply(as.character(seqs_guppy),strsplit, split="")
aux_quals <- 1-as(scores_guppy,"NumericList")
aux_pos <- lapply(aux_seqs, seq_along)
aux_names <- c(vapply(names(seqs_guppy),rep, each=1662))
data_guppy <- data.frame(seq_id=aux_names,
position=unlist(aux_pos),
nucleotide = unlist(aux_seqs),
p_Guppy=unlist(aux_quals)
)
rownames(data_guppy)<-NULL
data <- full_join(data_single,data_guppy, by=c("seq_id","nucleotide","position")) %>%
mutate(isWT = nucleotide==wildtype[position]) %>%
filter(isWT==0 & nucleotide!="-") %>%
group_by(position, nucleotide, p_Guppy,p_SINGLe) %>%
dplyr::summarise(counts=n())
signal_data_aux <- left_join(actual_mutations, data,by = c("nucleotide", "position"))
noise_data_aux <- anti_join(data, actual_mutations,by = c("nucleotide", "position"))
if(nrow(signal_data_aux)+nrow(noise_data_aux) != nrow(data)){stop('Check joins of data')}
#Store data
if(n==1){
signal_data <- signal_data_aux
noise_data <- noise_data_aux
}else{
signal_data <- rbind(signal_data,signal_data_aux)
noise_data <- rbind(noise_data, noise_data_aux)
}
rm(signal_data_aux, noise_data_aux, seqs_single, seqs_guppy, data, aux_seqs, aux_quals, aux_pos)
}
#Compute signal and noise for different cut-offs
p_cutoff_vec <- sort(c(0,10^seq(-10,-1, by=.5),seq(0.1,.99, by=0.01)))
signal_tp <- data.frame(matrix(NA, ncol=5, nrow=length(p_cutoff_vec)))
colnames(signal_tp) <- c("pcutoff", "Guppy", "SINGLe", "Guppy - weighted", "SINGLe - weighted")
signal_fn <- noise_tn <- noise_fp <- signal_tp
noise_data <- noise_data%>% ungroup()
signal_data <- signal_data %>% ungroup()
for(i in seq_along(p_cutoff_vec)){
p_cutoff <- p_cutoff_vec[i]
signal_tp_aux <- signal_data %>%
dplyr::summarise(counts_guppy = sum(counts[p_Guppy >= p_cutoff]),
counts_single = sum(counts[p_SINGLe >= p_cutoff]),
counts_guppy_w = sum( (counts*p_Guppy) [p_Guppy >= p_cutoff]),
counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe >= p_cutoff])
)
signal_fn_aux <- signal_data %>%
dplyr::summarise(counts_guppy = sum(counts[p_Guppy < p_cutoff]),
counts_single = sum(counts[p_SINGLe < p_cutoff]),
counts_guppy_w = sum( (counts*p_Guppy) [p_Guppy < p_cutoff]),
counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe < p_cutoff])
)
noise_fp_aux <- noise_data %>%
dplyr::summarise(counts_guppy = sum(counts[p_Guppy >= p_cutoff]),
counts_single = sum(counts[p_SINGLe >= p_cutoff]),
counts_guppy_w = sum( (counts*p_Guppy) [p_Guppy >= p_cutoff]),
counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe >= p_cutoff])
)
noise_tn_aux <- noise_data %>%
dplyr::summarise(counts_guppy = sum(counts[p_Guppy < p_cutoff]),
counts_single = sum(counts[p_SINGLe < p_cutoff]),
counts_gupy_w = sum( (counts*p_Guppy) [p_Guppy < p_cutoff]),
counts_single_w = sum( (counts*p_SINGLe) [p_SINGLe < p_cutoff])
)
signal_tp[i,] <- c(p_cutoff, unlist(signal_tp_aux))
noise_fp [i,] <- c(p_cutoff, unlist(noise_fp_aux))
signal_fn[i,] <- c(p_cutoff, unlist(signal_fn_aux))
noise_tn [i,] <- c(p_cutoff, unlist(noise_tn_aux))
}
aux_tp <- melt(signal_tp, id.vars = "pcutoff", value.name = "signal_tp")
aux_fp <- melt(noise_fp, id.vars = "pcutoff", value.name = "noise_fp")
aux_fn <- melt(signal_fn, id.vars = "pcutoff", value.name = "signal_fn")
aux_tn <- melt(noise_tn, id.vars = "pcutoff", value.name = "noise_tn")
rates <- aux_tp %>% full_join(aux_fp,by = c("pcutoff", "variable"))%>% full_join(aux_fn,by = c("pcutoff", "variable"))%>%full_join(aux_tn,by = c("pcutoff", "variable"))
rates <- rates %>%
mutate(ratio = signal_tp / noise_fp)%>%
mutate(tpr = signal_tp / (signal_tp + signal_fn))%>%
mutate(fpr = noise_fp / (noise_fp + noise_tn))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Figure_2A <- ggplot(rates %>% filter(variable=="Guppy" | variable=="SINGLe"), aes(y=tpr,x=fpr, col=variable))+
geom_line()+
scale_color_manual(values=c(guppy_color,single_color), breaks = c("Guppy","SINGLe"))+
scale_x_continuous(breaks=c(0,0.5,1), limits = c(0,1))+xlab("FPR")+
scale_y_continuous(breaks=c(0,0.5,1), limits = c(0,1))+ylab("TPR")+
theme_plot+
theme(legend.title = element_blank(), legend.position = c(0.6,0.2),legend.spacing.y = unit(0,"cm"),
legend.key.size = unit(0.5,"lines"), plot.margin = unit(c(0,5.5,5.5,5.5),"points"))+
labs(tag = "A")
Figure_2B <- ggplot(rates %>% filter(variable=="Guppy - weighted" | variable=="SINGLe - weighted") %>% mutate(variable = ifelse(variable == "Guppy - weighted", "Guppy", "SINGLe")), aes(pcutoff, y=ratio, col=variable))+
geom_line()+
scale_y_continuous(breaks=seq(1,4))+ylab("Signal / Noise")+
scale_x_continuous(breaks=c(0,0.5,1), limits = c(0,1))+xlab(expression("Cut off on p"["right"]))+
scale_color_manual(values=c(guppy_color,single_color), breaks=c("Guppy", "SINGLe"), name="")+
theme_plot+theme(legend.position = c(0.3,0.85),legend.spacing.y = unit(0,"cm"),
legend.key.size = unit(0.5,"lines"), plot.margin = unit(c(0,5.5,5.5,5.5),"points"))+
labs(tag = "B")
Figure_2A
Figure_2B
Figure_Sup3 <- ggplot(rates %>% filter(variable=="Guppy" | variable=="SINGLe"), aes(pcutoff, y=ratio, col=variable))+
geom_line()+
scale_y_continuous(breaks=seq(1,4))+ylab("Signal / Noise")+
scale_x_continuous(breaks=c(0,0.5,1), limits = c(0,1))+xlab(expression("Cut off on p"["right"]))+
scale_color_manual(values=c(guppy_color,single_color), breaks=c("Guppy", "SINGLe"), name="")+
theme_plot+theme(legend.position = c(0.3,0.85),legend.spacing.y = unit(0,"cm"),
legend.key.size = unit(0.5,"lines"), plot.margin = unit(c(0,5.5,5.5,5.5),"points"))
Figure_Sup3
Compute convergence of consensus. It takes long.
General inputs
wildtype_homopolymers <-
c(F, wildtype[2:length(wildtype)] == wildtype[1:(length(wildtype)-1)]) |
c(wildtype[1:(length(wildtype)-1)] == wildtype[2:length(wildtype)] ,F)
pos_wt_homopolymers <- which(wildtype_homopolymers)
tic <- proc.time()
for(i in 1:7){
#Load corrected data
seqs_single <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe.fastq"))
aux_seqs <- vapply(as.character(seqs_single),strsplit, split="")
aux_quals <- 1-as(quality(seqs_single),"NumericList")
aux_pos <- lapply(aux_seqs, seq_along)
aux_names <- c(vapply(names(seqs_single),rep, each=1662))
data_single <- data.frame(nucleotide = unlist(aux_seqs),
p_SINGLe=unlist(aux_quals),
position=unlist(aux_pos) ,
SeqID=aux_names)
rownames(data_single)<-NULL
seqs_original <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe_original.fastq"))
aux_seqs <- vapply(as.character(seqs_original),strsplit, split="")
aux_quals <- 1-as(quality(seqs_original),"NumericList")
aux_pos <- lapply(aux_seqs, seq_along)
aux_names <- c(vapply(names(seqs_original),rep, each=1662))
data_guppy <- data.frame(SeqID=aux_names,
position=unlist(aux_pos),
nucleotide = unlist(aux_seqs),
p_Guppy=unlist(aux_quals)
)
rownames(data_guppy) <- NULL
data <- full_join(data_single,data_guppy, by=c("SeqID","nucleotide","position")) %>%
mutate(isWT = nucleotide==wildtype[position]) %>%
mutate(p_SINGLe=ifelse(is.na(p_SINGLe), 1, p_SINGLe))%>%
mutate(p_noweights=1)
#Compute consensus for each barcode using all sequences available and using the three available methods
file_out_consensus<- paste0(path_analysis_lib7,kt7_barcodes[i],"_ConsensusBySINGLE.txt")
if(file.exists(file_out_consensus)){file.remove(file_out_consensus)}
cat("Barcode\tnseqs\trepetition\tmutation\n", file=file_out_consensus,append=F)
pb <- txtProgressBar(0,length(subsets)*n_repetitions,style=3)
counter <- 0
bc_seqsID <- unique(data$SeqID)
for( s in subsets){
for (r in 1:n_repetitions) {
subset_bc_reads <- data %>%
filter(SeqID %in% sample(bc_seqsID, s))
cons_single <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_SINGLe,position),
cutoff_prob = 0)
cons_guppy <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_Guppy,position),
cutoff_prob = 0)
cons_noweight <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_noweights,position),
cutoff_prob = 0)
muts_single <- detect_mutations(biostr_to_char(cons_single), wildtype)
muts_guppy <- detect_mutations(biostr_to_char(cons_guppy), wildtype)
muts_noweight <- detect_mutations(biostr_to_char(cons_noweight), wildtype)
n_single <- length(muts_single)
n_guppy <- length(muts_guppy)
n_noweight <- length(muts_noweight)
n_tot <- n_single+n_guppy+n_noweight
df <- data.frame(barcode = rep(kt7_barcodes[i],n_tot),
nseqs=s,repetition=r,
mutation=c(muts_single,muts_guppy,muts_noweight),
method = c(rep("SINGLe", n_single),
rep("Guppy",n_guppy),
rep("No weights",n_noweight)))
write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus)
counter <- counter+1
setTxtProgressBar(pb,counter)
}
}
}
rm(data)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
tic <- proc.time()
for(i in 1:7){
#Load corrected data
seqs_single <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe.fastq"))
aux_seqs <- vapply(as.character(seqs_single),strsplit, split="")
aux_quals <- 1-as(quality(seqs_single),"NumericList")
aux_pos <- lapply(aux_seqs, seq_along)
aux_names <- c(vapply(names(seqs_single),rep, each=1662))
data_single <- data.frame(nucleotide = unlist(aux_seqs),
p_SINGLe=unlist(aux_quals),
position=unlist(aux_pos) ,
SeqID=aux_names)
rownames(data_single)<-NULL
seqs_original <- readQualityScaledDNAStringSet(paste0(path_analysis_lib7, kt7_barcodes[i],"_SINGLe_original.fastq"))
aux_seqs <- vapply(as.character(seqs_original),strsplit, split="")
aux_quals <- 1-as(quality(seqs_original),"NumericList")
aux_pos <- lapply(aux_seqs, seq_along)
aux_names <- c(vapply(names(seqs_original),rep, each=1662))
data_guppy <- data.frame(SeqID=aux_names,
position=unlist(aux_pos),
nucleotide = unlist(aux_seqs),
p_Guppy=unlist(aux_quals)
)
rownames(data_guppy) <- NULL
data <- full_join(data_single,data_guppy, by=c("SeqID","nucleotide","position")) %>%
mutate(isWT = nucleotide==wildtype[position]) %>%
mutate(p_SINGLe=ifelse(is.na(p_SINGLe), 1, p_SINGLe))%>%
mutate(p_noweights=1)
#Compute consensus for each barcode using all sequences available and using the three available methods
file_out_consensus<- paste0(path_analysis_lib7,kt7_barcodes[i],"_ConsensusBySINGLE_rmHomopolforVCS.txt")
if(file.exists(file_out_consensus)){file.remove(file_out_consensus)}
cat("Barcode\tnseqs\trepetition\tmutation\n", file=file_out_consensus,append=F)
pb <- txtProgressBar(0,length(subsets)*n_repetitions,style=3)
counter <- 0
bc_seqsID <- unique(data$SeqID)
for( s in subsets){
for (r in 1:n_repetitions) {
subset_bc_reads <- data %>%
filter(SeqID %in% sample(bc_seqsID, s))
index_homopolymers <- which(subset_bc_reads$position %in% pos_wt_homopolymers & subset_bc_reads$nucleotide=="-")
n_memory <- c()
for(n in index_homopolymers){
if(n %in% n_memory){next()}
hp_pos <- subset_bc_reads$position[n]
hp_pos_ref <- wt_homopolymers[vapply(wt_homopolymers, function(x){hp_pos %in% x})][[1]]
hp_vec_logical <- hp_pos_ref==hp_pos
hp_vec_length <- length(hp_vec_logical)
hp_vec <- rep(NA,hp_vec_length)
ind_aux <- which(hp_vec_logical)
hp_vec[1:ind_aux] <- rev(n+1-1:ind_aux)
if(ind_aux<hp_vec_length){
hp_vec[(ind_aux+1):hp_vec_length] <- n+c(1:(hp_vec_length-ind_aux))
}
n_memory <- hp_vec
data_aux <- subset_bc_reads[hp_vec,]
if(all(data_aux$nucleotide=="-")){next()}
pos <- data_aux$position
nucl <- data_aux$nucleotide
ind_order <- c(which(nucl!="-"),which(nucl=="-"))
subset_bc_reads$position[hp_vec] <- pos[order(ind_order)]
if(length( which(subset_bc_reads$position %in% pos_wt_homopolymers &
subset_bc_reads$nucleotide=="-"))<length(index_homopolymers)){
cat("n=",n,"\n")
stop()
}
}
subset_bc_reads <- subset_bc_reads %>% arrange(SeqID,position)
cons_single <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_SINGLe,position),
cutoff_prob = 0)
cons_guppy <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_Guppy,position),
cutoff_prob = 0)
cons_noweight <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_noweights,position),
cutoff_prob = 0)
muts_single <- detect_mutations(biostr_to_char(cons_single), wildtype)
muts_guppy <- detect_mutations(biostr_to_char(cons_guppy), wildtype)
muts_noweight <- detect_mutations(biostr_to_char(cons_noweight), wildtype)
n_single <- length(muts_single)
n_guppy <- length(muts_guppy)
n_noweight <- length(muts_noweight)
n_tot <- n_single+n_guppy+n_noweight
if(n_tot==0){
df <- data.frame(barcode = rep(kt7_barcodes[i],3),
nseqs=s,
repetition=r,
mutation="wildtype",
method = c("SINGLe", "Guppy","No weights"))
}else{
df <- data.frame(barcode = rep(kt7_barcodes[i],n_tot),
nseqs=s,
repetition=r,
mutation=c(muts_single,muts_guppy,muts_noweight),
method = c(rep("SINGLe", n_single),rep("Guppy",n_guppy), rep("No weights",n_noweight)))
}
write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus)
counter<- counter+1
setTxtProgressBar(pb,counter)
}
}
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Prepare fast5 files for nanopolish. Please indicate proper paths indicated in capital letters
tic <- proc.time()
PATH_TO_FAST5_FILES<- "2_NanoporeRawReads/SmallSet_fast5/" #PATH_TO_FAST5_FILES location of nanopore reads
dir.create(paste0(path_analysis_lib7,"fast5files/")) # individual fast5 files will be stored there temporally
# Split to individual files
split_fast5 <- paste0("multi_to_single_fast5 -i ",PATH_TO_FAST5_FILES, " -s ",path_analysis_lib7, "fast5files/")
system2(split_fast5)
input_table_barcodes <- read.table(paste0(path_analysis_lib7,"barcoding_summary.txt"),header=T)
input_table_barcodes <- input_table_barcodes %>%
filter(barcode_arrangement %in% kt7_barcodes) %>%
select(read_id,barcode_arrangement)
# Split individual files into folders according to barcode
individual_files <- dir(pattern="*.fast5",path=paste0(path_analysis_lib7,"fast5files/"),recursive=T)
individual_files <- data.frame(file =individual_files)
individual_files$read_id <- vapply(individual_files$file, function(x) {
sub(pattern = ".fast5",replacement = "",x=strsplit(x,split="/", fixed=T)[[1]][2])
})
individual_files <- left_join(individual_files,input_table_barcodes)
for(bc in kt7_barcodes){
dir.create(paste0(path_analysis_lib7,"fast5files/", bc))
barcode_files <- individual_files %>% filter(barcode_arrangement==bc)
commands <- paste0("mv ",path_analysis_lib7,"fast5files/", barcode_files$file," ", path_analysis_lib7,"fast5files/",bc,"/")
vapply(commands,system2)
}
# Create one file per barcode
dir.create(paste0(path_analysis_lib7, "fast5files_grouped"))
for(bc in kt7_barcodes){
dir.create(paste0(path_analysis_lib7, "fast5files_grouped/",bc))
command <- paste0("single_to_multi_fast5 -i ",path_analysis_lib7,"fast5files/", bc, " -s ", path_analysis_lib7,"fast5files_grouped/",bc, " -f ", bc, " -n ", 1000)
system2(command)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Create bash file to compute consensus by nanopolish and medaka
tic <- proc.time()
dir.create(paste0(path_analysis_lib7,"subsets/"))
wd <- getwd()
for(bc in kt7_barcodes){
cat(bc,"\n")
store_path <- paste0(path_analysis_lib7,"subsets/")
fastq_file <- readLines(paste0(path_analysis_lib7, bc,".fastq"))
name_lines <- grep(pattern="^@",x=fastq_file)
bash_file <- paste0(path_analysis_lib7,bc, ".sh")
cat("#!/bin/bash \n",file=bash_file, append=F)
cat(paste0("cd ", wd,"\n"), file=bash_file,append=T)
results_racon <- paste0(path_analysis_lib7,bc,"_ConsensusbyRacon.txt")
results_medaka <- paste0(path_analysis_lib7,bc,"_ConsensusbyMedaka.txt")
nanopolish_results <- paste0(path_analysis_lib7,bc,"_ConsensusbyNanopolish.txt")
file.create(results_racon,overwrite=TRUE)
file.create(results_medaka,overwrite=TRUE)
file.create(nanopolish_results,overwrite=TRUE)
for(ns in subsets){
cat("\t subset size",ns,"\n")
if(ns> length(name_lines)){next()}
for(r in 1:n_repetitions){
# cat("----------------",ns, r, "\n")
basename <- paste0(bc,"_",ns,"_",r)
filename <- paste0(store_path,basename)
fastq <- paste0(filename,".fastq")
## Make file of subset of sequences
choose_lines <- sample(name_lines, size= ns)
choose_lines_all <- sort(c(choose_lines,choose_lines+1,choose_lines+2,choose_lines+3))
writeLines(text = fastq_file[choose_lines_all], con = file(fastq))
sam <- paste0(filename,".sam")
sorted <- paste0(filename,".sorted.fasta")
racon <- paste0(filename,".racon.fasta")
medaka <- paste0(filename,"_medaka.fasta")
nanopolish <- paste0(filename,"_nanopolish.txt")
## Run minimap2
line_minimap <- paste0("minimap2 -ax map-ont ", reference_file, " ", fastq," > ", sam)
## Run racon
line_racon <- paste0("racon ",fastq," ",sam, " ",reference_file, " >> ", racon)
line_results_racon <- paste0("echo '>",basename, "' >> ",results_racon)
line_results_racon2 <- paste0("tail -n 1 ",racon, " >> ",results_racon)
line_remove_racon <- paste0("rm ", racon,".fai ",racon, ".mmi ",racon )
## Run Medaka
line_medaka <- paste0("medaka_consensus -i ",fastq," -d ",racon," -o ", medaka," -t 4 ")
line_results_medaka <- paste0("echo '>",basename, "' >> ",results_medaka)
line_results_medaka2 <- paste0("tail -n 1 ",filename,"_medaka.fasta/consensus.fasta", " >> ",results_medaka)
line_remove_medaka <- paste0("rm -r ",filename,"_medaka.fasta/")
## Run nanopolish
line_nanopolish_index <- paste0("nanopolish index -d ", path_analysis_lib7, "fast5files_grouped/",bc," ",fastq)
line_nanpolish_sort <- paste0(" samtools sort ",sam," -o ",filename,".sorted.fastq -T temporal.tmp ")
line_nanopolish_index2 <- paste0("samtools index ", filename,".sorted.fastq")
line_nanopolish_variants <- paste0("nanopolish variants --consensus -o ", nanopolish, " -r ",fastq," -b ",filename,".sorted.fastq -g ", reference_file)
line_results_nanopolish <- paste0("awk '!/^#/ {print \"",filename,"\\t\" $0}' ",nanopolish, " >> ",nanopolish_results)
line_remove_nanopolish <- paste0("rm ", filename,".fastq.index* ", filename,".sorted* ", nanopolish, " ", filename,".sam ")
cat(line_minimap,
line_racon,
line_results_racon,
line_results_racon2,
line_medaka,
line_results_medaka,
line_results_medaka2,
line_nanopolish_index,
line_nanpolish_sort,
line_nanopolish_index2,
line_nanopolish_variants,
line_results_nanopolish,
line_remove_racon,
line_remove_medaka,
line_remove_nanopolish,
file=bash_file, sep="\n", append=T)
}
}
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Run barcodeXX.sh bash files in a terminal with conda activate medaka:
cd 3_Analysis/KT7_1700_2100
chmod +x *sh
conda activate medaka
nohup ./barcode05.sh > bc05_out.log 2>bc05_err.log &
nohup ./barcode06.sh > bc06_out.log 2>bc06_err.log &
nohup ./barcode07.sh > bc07_out.log 2>bc07_err.log &
nohup ./barcode08.sh > bc08_out.log 2>bc08_err.log &
nohup ./barcode09.sh > bc09_out.log 2>bc09_err.log &
nohup ./barcode10.sh > bc10_out.log 2>bc10_err.log &
nohup ./barcode11.sh > bc11_out.log 2>bc11_err.log &
Run NextPolish
tic <- proc.time()
create_run_cfg <- function(run_cfg_file, ref_fasta, workdir){
cat("[General]\n", file=run_cfg_file, append=FALSE)
cat('job_type = local\n', file=run_cfg_file, append=TRUE)
cat('job_prefix = nextPolish\n', file=run_cfg_file, append=TRUE)
cat('task = best\n', file=run_cfg_file, append=TRUE)
cat('rewrite = yes\n', file=run_cfg_file, append=TRUE)
cat('rerun = 3\n', file=run_cfg_file, append=TRUE)
cat('parallel_jobs = 2\n', file=run_cfg_file, append=TRUE)
cat('multithread_jobs = 4\n', file=run_cfg_file, append=TRUE)
cat('genome =', ref_fasta,' #genome file\n', file=run_cfg_file, append=TRUE)
cat('genome_size = auto\n', file=run_cfg_file, append=TRUE)
cat('workdir =',workdir, "\n",file=run_cfg_file, append=TRUE, sep="")
cat(' polish_options = -p {multithread_jobs}\n', file=run_cfg_file, append=TRUE)
cat('[lgs_option]\n', file=run_cfg_file, append=TRUE)
cat('lgs_fofn =lgs.fofn\n', file=run_cfg_file, append=TRUE)
cat('lgs_options = -min_read_len 1k -max_depth 100\n', file=run_cfg_file, append=TRUE)
cat('lgs_minimap2_options = -x map-ont\n', file=run_cfg_file, append=TRUE)
return(0)
}
dir.create(paste0(path_analysis_lib7,"nextpolish/"))
system2(paste0("cp ", reference_file, " ",path_analysis_lib7,"nextpolish/"))
for(bc in kt7_barcodes){
save_file <- paste0(path_analysis_lib7,bc, "_ConsensusbyNextPolish.fasta")
for(ns in subsets){
cat("\t",ns,"\n")
if(ns> length(name_lines)){next()}
for(r in 1:n_repetitions){
cat("----------------",ns, r, "\n")
#Create file with subset of sequences
subset_name <- paste0(bc,"_",ns, "_",r)
filename_subset <- paste0(run_folder,subset_name,".fastq")
## Run nextpolish
system2(paste0("mv ", path_analysis_lib7,"subsets/", subset_name, ".fastq ", path_analysis_lib7,"nextpolish/"))
system2(paste0("echo '", subset_name,".fastq' >", path_analysis_lib7,"nextpolish/lgs.fofn"))
create_run_cfg(run_cfg_file=paste0(path_analysis_lib7,"nextpolish/run.cfg"),
ref_fasta="KTrefORF_1662.fasta",
workdir=paste0(subset_name,"/"))
system2(paste0("cd ",path_analysis_lib7,"nextpolish/ ; nextPolish run.cfg"))
system2(paste0("sed -i '1s/.*/>",subset_name,"'/ ",path_analysis_lib7,"nextpolish/",subset_name, "/genome.nextpolish.fasta "))
system2(paste0("cat ",path_analysis_lib7,"nextpolish/", subset_name, "/genome.nextpolish.fasta >>", save_file))
system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/", subset_name))
system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/", subset_name,".fastq"))
system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/lgs.fofn"))
system2(paste0("rm -r ", path_analysis_lib7,"nextpolish/run.cfg"))
}
}
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Identify mutations
tic <- proc.time()
df_nextpolish <- data.frame(
Barcode=NA,
Nreads=NA,
repetition=NA,
mutation=NA
)
for(bc in kt7_barcodes){
np<- readDNAStringSet(paste0(path_analysis_lib7,bc, "_ConsensusbyNextPolish.fasta"))
ali <- pairwiseAlignment(pattern = np, subject = wildtypye_bstr)
for(i in seq_along(ali)){
ref <- biostr_to_char(subject(ali[i]))
seq <- biostr_to_char(pattern(ali[i]))
ind_noI <- which(ref!="-")
ref <- ref[ind_noI]
seq <- seq[ind_noI]
mutations <- detect_mutations(seq,ref)
ids <- str_split(names(np)[i], "_")[[1]]
if(length(mutations)==0){next()}
df_aux <- data.frame(Barcode=ids[1],
Nreads=as.numeric(str_remove(ids[2],"s")),
repetition=as.numeric(str_remove(ids[3],"r")),
mutation = mutations)
df_nextpolish <- rbind(df_nextpolish,df_aux)
}
}
df_nextpolish <- df_nextpolish %>% filter(!is.na(Barcode))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Re-shape results to a unique matrix
tic <- proc.time()
# Racon
racon_mismatches <- data.frame(Barcode=NA,Nreads=NA,repetition=NA,Position=NA,base.original=NA,base.mutated=NA,mutation=NA)
for(bc in kt7_barcodes){
racon_consensus <- readDNAStringSet(paste0(path_analysis_lib7,bc,"_ConsensusbyRacon.txt"))
for(i in seq_along(racon_consensus)){
name <- names(racon_consensus)[i]
info <- strsplit(split="_",x=name)[[1]]
racon <- racon_consensus[[i]]
ali <- pairwiseAlignment(wildtype_bstr,racon)
ref <- strsplit(as.character(pattern(ali)), split="")[[1]]
read <- strsplit(as.character(subject(ali)),split="")[[1]]
index <- which(ref!=read)
mismatches <- paste0(ref[index],index,read[index])
if(length(mismatches)>0){
df_aux <- data.frame(Barcode=bc,Nreads=info[2],repetition=info[3],Position=index,
base.original=ref[index],base.mutated=read[index],mutation=mismatches)
racon_mismatches <- racon_mismatches %>% rbind(df_aux)
}
}
}
racon_mismatches <- racon_mismatches %>% filter(!is.na(Barcode)) %>% mutate(method="Racon",homopolymers=NA)
# Medaka
medaka_mismatches <- data.frame(Barcode=NA,Nreads=NA,repetition=NA,Position=NA,base.original=NA,base.mutated=NA,mutation=NA)
for(bc in kt7_barcodes){
medaka_consensus <- readDNAStringSet(paste0(path_analysis_lib7,bc,"_ConsensusbyMedaka.txt"))
for(i in seq_along(medaka_consensus)){
name <- names(medaka_consensus)[i]
info <- strsplit(split="_",x=name)[[1]]
medaka <- medaka_consensus[[i]]
ali <- pairwiseAlignment(wildtype_bstr,medaka)
ref <- strsplit(as.character(pattern(ali)), split="")[[1]]
read <- strsplit(as.character(subject(ali)),split="")[[1]]
index <- which(ref!=read)
mismatches <- paste0(ref[index],index,read[index])
if(length(mismatches)>0){
df_aux <- data.frame(Barcode=bc,Nreads=info[2],repetition=info[3],Position=index,
base.original=ref[index],base.mutated=read[index],mutation=mismatches)
medaka_mismatches <- medaka_mismatches %>% rbind(df_aux)
}
}
}
medaka_mismatches <- medaka_mismatches %>% filter(!is.na(Barcode)) %>% mutate(method="Medaka",homopolymers=NA)
# Nanopolish
nanopolish_mismatches <- data.frame(Barcode=NA,Nreads=NA,repetition=NA,Position=NA,base.original=NA,base.mutated=NA)
for(bc in kt7_barcodes){
results_nanopolish <- system2(paste0("awk \'/^[^#]/ \' ",bc,"_ConsensusbyNanopolish.txt"), intern = T)
if(length(results_nanopolish)==0){next()}
aux_nanopolish <- do.call(rbind,vapply(results_nanopolish, strsplit, split="\t"))
colnames(aux_nanopolish) <- rownames(aux_nanopolish) <- NULL
aux_nanopolish <- aux_nanopolish[,c(1,3,5,6)]
info <- vapply(aux_nanopolish[,1],strsplit,split="_")
if(nrow(aux_nanopolish)>0){
aux_nanopolish <- data.frame(Barcode=bc,
Nreads=vapply(info, function(x){x[2]}),
repetition=vapply(info, function(x){x[3]}),
Position=aux_nanopolish[,2],
base.original=aux_nanopolish[,3],base.mutated=aux_nanopolish[,4])
nanopolish_mismatches <- nanopolish_mismatches %>% rbind(aux_nanopolish)
}
}
nanopolish_mismatches <- nanopolish_mismatches %>% filter(!is.na(Barcode)) %>%
mutate(Nreads=as.numeric(Nreads), repetition=as.numeric(repetition), Position=as.numeric(Position))
#remove insertions
ind_insertions <- nchar(nanopolish_mismatches$base.mutated)>1
nanopolish_mismatches$base.mutated[ind_insertions] <- vapply(nanopolish_mismatches$base.mutated[ind_insertions] , substr,1,1)
nanopolish_mismatches <- nanopolish_mismatches %>% filter(base.mutated!=base.original)
#split deletions
index_aux <- which(nchar(nanopolish_mismatches$base.original)==2)
nanopolish_mismatches$base.original[index_aux] <- substr(nanopolish_mismatches$base.original[index_aux],2,2)
nanopolish_mismatches$base.mutated[index_aux] <- "-"
nanopolish_mismatches$Position[index_aux] <- as.numeric(nanopolish_mismatches$Position[index_aux])+1
index_aux_3 <- which(nchar(nanopolish_mismatches$base.original)==3)
aux_df_1 <- nanopolish_mismatches[index_aux_3,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"
aux_df_2 <- nanopolish_mismatches[index_aux_3,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"
nanopolish_mismatches <- nanopolish_mismatches[-index_aux_3,]
nanopolish_mismatches <- nanopolish_mismatches %>% rbind(aux_df_1) %>% rbind(aux_df_2) %>% arrange(Barcode,Nreads,repetition,Position) %>% mutate(method="Nanopolish",homopolymers=NA, mutation=paste0(base.original,Position,base.mutated))
#SINGLe
single_mismatches <- lapply(kt7_barcodes,
function(x){
name=paste0(x,"_ConsensusBySINGLE.txt")
y = read.table(name, header=T,row.names=NULL)
return(y)})
colnames(single_mismatches) <- c("Barcode","Nreads","repetition","mutation","method")
single_mismatches$homopolymers <- "original"
single_mismatches_sortHP <- lapply(kt7_barcodes,
function(x){
name=paste0(x,"_ConsensusBySINGLE_50reps_rmHomopolforVCS.txt")
y = read.table(name, header=T,row.names=NULL)
return(y)})
colnames(single_mismatches_sortHP) <- c("Barcode","Nreads","repetition","mutation","method")
single_mismatches_sortHP$homopolymers = "sorted_at_VCS"
single_mismatches <- rbind(single_mismatches_sortHP,single_mismatches) %>%
mutate(base.original = str_sub(mutation,1,1),
base.mutated=str_sub(mutation,-1,-1),
position=str_sub(mutation,start=2,end=-2)) %>%
mutate(position = as.numeric(position)) %>% dplyr::rename(Position=position) %>%
mutate(method = recode(method,!!!setNames(c("SINGLe","Guppy","No weights"),c("single","nanopore","no_weights"))))
df_nextpolish <- df_nextpolish %>%
mutate(method="NextPolish") %>%
mutate(homopolymers=NA) %>%
mutate(base.original = str_sub(mutation, 1,1)) %>%
mutate(base.mutated = str_sub(mutation, -1,-1)) %>%
mutate(Position = str_sub(mutation,2,-2))
#Save
kt7_mismatches_subsets_allMethods <- rbind(medaka_mismatches,nanopolish_mismatches,single_mismatches, df_nextpolish)
write.table(kt7_mismatches_subsets_allMethods, file=paste0(path_analysis_lib7, "KT7_ConsensusConvergence.txt"))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Load results of previous chunks
kt7_mismatches_subsets_allMethods <- read.table(paste0(path_analysis_lib7, "KT7_ConsensusConvergence.txt"), header=T)
Compute averages
tic <- proc.time()
consensus_by_method <- kt7_mismatches_subsets_allMethods %>%
mutate(variant = recode(Barcode, !!!kt7_bc2mut))
#correct equivalent deletion
ind_aux <- which(consensus_by_method$mutation=="G489-")
consensus_by_method$mutation[ind_aux] <- "G490-"
consensus_by_method$Position[ind_aux] <- 490
consensusStrand_by_method <- consensus_by_method %>%
group_by(Barcode,variant,Nreads,repetition,method,homopolymers) %>%
summarise(consensus = paste(mutation, collapse=" ")) %>%
left_join(kt7_true_mutants, by="Barcode") %>%
mutate(correct = consensus==true_consensus)
n_correct_consensus_average <- consensusStrand_by_method %>%
group_by(Barcode,variant,Nreads,method,homopolymers) %>%
summarise(total_correct = sum(correct)) %>%
mutate(success_rate = total_correct/50)
nmismatches_by_method <- consensus_by_method %>%
group_by(Barcode,variant,Nreads,repetition, method,homopolymers) %>%
summarise(n_mismatches=n())
average_nmismatches_by_method <- nmismatches_by_method %>%
group_by(Barcode,variant,Nreads,method,homopolymers) %>%
summarise(mean=sum(n_mismatches)/50,median=median(n_mismatches))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
linetype_vector <- c("solid","solid","dashed","dashed","solid","solid")
names(linetype_vector) <- c("Medaka","Nanopolish","Guppy","No weights","SINGLe","NextPolish")
average_mismatch_to_plot <- average_nmismatches_by_method %>%
filter(method !="Racon") %>%
filter((method=="SINGLe" & homopolymers=="sorted_at_VCS") |
is.na(homopolymers) |
(method!="SINGLe" & homopolymers=="original") )
n_correct_to_plot <- n_correct_consensus_average %>% filter(method !="Racon") %>%
filter((method=="SINGLe" & homopolymers=="sorted_at_VCS") |is.na(homopolymers) | (method!="SINGLe" & homopolymers=="original") )
Figure_2C <- ggplot(average_mismatch_to_plot %>%filter(Barcode=="barcode09"),
aes(x=Nreads, y=mean, col=method,lty=method))+
geom_point(size=0.5)+geom_line()+
ylab("Mismatches")+
xlab("Reads")+xlim(c(0,20))+
scale_color_manual(values=colors_vector_capital)+
scale_linetype_manual(values=linetype_vector, guide="none")+
theme_plot+
theme(legend.position = "none",plot.margin = unit(c(0,5.5,5.5,5.5),"points"))+
labs(tag = "C")
Figure_2C
Figure_2D <- ggplot(n_correct_to_plot ,
aes(x=Nreads,y=success_rate,col=method, lty=method))+
geom_point(size=1)+
geom_line()+
guides(col=guide_legend(ncol=1))+
geom_hline(aes(yintercept=0.9),col=grey(.5),linetype="dashed")+
scale_color_manual(values=colors_vector_capital[-6])+
scale_linetype_manual(values=linetype_vector, guide="none")+
theme_plot+
scale_y_continuous(breaks=c(0,.5,1), limits = c(0,1))+
scale_x_continuous(breaks=c(10,30,50))+
theme(
legend.position = c(0.8,0.5),
legend.box.margin = margin(0,0,0,0),
legend.box.spacing=unit(1,"pt"))+
facet_wrap(~variant, ncol=3, dir = "v")+
labs(tag = "D",x="Reads",y="Success rate",col="Method")
Figure_2D
Figure2 <- grid.arrange(Figure_2A , Figure_2B ,Figure_2C, Figure_2D,
layout_matrix=matrix(c(1,2,3,4,4,4), byrow = T,nrow=2),
heights=c(1,3))
Figure2
ggsave(Figure2, file=paste0(path_save_figures,"Figure2.png"), dpi = 'print', width = 16, height=20,units="cm")
Compute TP/FP/TN/FN by method
tic <- proc.time()
kt7_mismatches_subsets_allMethods <- read.table(paste0(path_analysis_lib7, "KT7_ConsensusConvergence.txt"), header=T)
consensus_by_method <- kt7_mismatches_subsets_allMethods %>%
mutate(variant = recode(Barcode, !!!kt7_bc2mut))
data_mutations_local <- kt_true_mutations %>%
mutate(wt = wildtype[position]) %>%
mutate(mutation =paste0(wt,position,nucleotide)) %>%
dplyr::rename(Barcode=sequence_id)%>%
select(Barcode, mutation) %>%
mutate(true_mut =1) %>%
group_by(Barcode) %>%
mutate(n_muts = n())
a <- consensus_by_method %>%
filter(method!="Racon") %>%
filter((method=="SINGLe" & homopolymers=="sorted_at_VCS") |is.na(homopolymers) | (method!="SINGLe" & homopolymers=="original") )%>%
select(Barcode, Nreads, repetition, mutation, method) %>%
left_join(data_mutations_local %>% select(-n_muts), by=c("Barcode", "mutation")) %>%
mutate(true_mut = ifelse(is.na(true_mut), 0,true_mut)) %>%
left_join(data_mutations_local %>% select(-true_mut, - mutation) %>% unique(), by=c("Barcode"))%>%
mutate(variant = recode(Barcode, !!!kt7_bc2mut)) %>%
select(-Barcode) %>%
dplyr::rename(Method=method)
class_vec <- c("True mutations", "False mutations", "False wild type", "True wild type", "Detected mutations")
names(class_vec) <- c("true_pos","false_pos","false_neg","true_neg", "total_mut")
sum_a_melt_av <- a %>%
group_by(variant,Nreads, repetition,Method) %>%
summarise(total_mut = n(),
true_pos = sum(true_mut),
false_pos =sum(1-true_mut),
n_muts=mean(n_muts)) %>%
mutate(false_neg = n_muts-true_pos, true_neg=1662-total_mut-false_neg) %>%
reshape2::melt(c("variant","Nreads","repetition","Method")) %>%
group_by(variant,Nreads, Method, variable) %>%
dplyr::summarise(mean_value = mean(value))%>%
mutate(variable = recode(variable, !!!class_vec))
sum_sens <- sum_a_melt_av %>%
group_by(Nreads, variant, Method) %>%
reshape2::dcast(Nreads+ variant+Method~variable) %>%
mutate(sensitivity = `True mutations`/(`True mutations` + `False wild type`) ) %>%
mutate(specificity = `True wild type`/(`True wild type` + `False mutations`) )
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Figure_S6A <- ggplot(sum_a_melt_av %>% filter(Nreads <10 & variable != "n_muts" ),
aes(x=Nreads, y=mean_value, col=Method))+
geom_line()+
facet_grid(variable~variant, scales = "free_y", labeller = label_wrap_gen(width=10))+
xlab("Reads")+ylab("Mean of counts")+
theme_plot+
scale_color_manual(values=colors_vector_capital[-6])+
theme(legend.position = "none")+
ggtitle(label="A")
Figure_S6B <-ggplot(sum_sens %>% filter(Nreads <10 ),
aes(x=specificity, y=sensitivity, col=Method))+
facet_grid(~variant, labeller = label_wrap_gen(width=10))+
geom_point()+geom_line()+
theme_plot+
scale_color_manual(values=colors_vector_capital[-6])+
theme(legend.position = "bottom", strip.background = element_blank(),
strip.text.x = element_blank(),plot.margin = margin(t = 0,r=35,b = 0,l = 10))+
ggtitle(label="B")+
scale_x_continuous(breaks=c(0.990,1))+
scale_y_continuous(breaks=c(0.2,0.6,1.0))
Figure_S6 <- grid.arrange(Figure_S6A , Figure_S6B,
layout_matrix=matrix(c(1,2), byrow = T,nrow=2),
heights=c(2.5,1))
ggsave(Figure_S6,filename=paste0(path_save_figures,"FigureS6.png"), width=16, height = 18, units = "cm", dpi='print')
### Figure S4 (homopolymers) |
Homopolymers analysis |
```r Figure_S4 <- ggplot(n_correct_consensus_average %>% filter(method %in% c(“SINGLe”,“No weights”,“Guppy”)), aes(x=Nreads,y=success_rate,col=method,lty=homopolymers))+ geom_line()+facet_grid(method~variant)+ ylab(“Success rate”)+geom_hline(aes(yintercept=0.9),col=grey(.5), lty=“dashed”)+ xlab(“Reads”)+ scale_color_manual(values=colors_vector_capital[c(1,3,5)],name=“Method”)+ scale_linetype_manual(values=c(“solid”,“dashed”),name=“Homopolymers”,labels=c(“Original”,“Sorted”))+ scale_y_continuous(breaks=c(0,.5,1), limits = c(0,1))+scale_x_continuous(breaks=c(10,20,30,40,50))+ theme_plot+ theme(legend.position = “bottom”,legend.spacing.y = unit(0,“line”), legend.key.size = unit(1.5,“line”), plot.margin = unit(c(0,5.5,5.5,5.5),“points”), strip.background = element_rect(fill= “white”)) Figure_S4 |
ggsave(Figure_S4,filename=paste0(path_save_figures,“FigureS4.png”), width=17, height = 10, units = “cm”, dpi=‘print’) ``` |
file_out_consensus<- paste0(path_analysis_lib,"/KTLib_ConsensusSINGLe.txt")
file_out_consensus_topten <- paste0(path_analysis_lib,"KTLib_ConsensusbySINGLe_toptenBC.txt")
file_consensus_KTLib <- paste0(path_analysis_lib, "KTLib_consensus_table.txt")
Basecalling
guppy_basecaller -i 2_NanoporeRawReads/LargeSet_fast5 -s 3_Analysis/LargeSet_SUP -c dna_r9.4.1_450bps_sup.cfg -x 'cuda:0' -r
#Filter sequences by length (1700 to 2100 bp)
mkdir 3_Analysis/KTLib/
awk 'NR % 4 ==2 && length($0) > 1700 && length($0) <2100 {print f "\n" $0;getline;print;getline;print} {f=$0}' 3_Analysis/LargeSet_SUP/pass/*fastq > 3_Analysis/KTLib/KTLib_1700_2100.fastq
Keep reads with mean Qscore>15
seqsum_ktlib <- read.table("3_Analysis/LargeSet_SUP/sequencing_summary.txt", header=T) %>%
filter(passes_filtering & mean_qscore_template>15) %>%
select(read_id)
file_ktlib_fastq <- "3_Analysis/KTLib/KTLib_1700_2100.fastq"#LargeSet_1700_2100.fastq"
file_ktlib_q15_fastq <- "3_Analysis/KTLib/KTLib_1700_2100_Q15.fastq"
nlines <- as.numeric(str_split(system2(paste0("wc -l ",file_ktlib_fastq), intern=T), pattern=" ")[[1]][1])
connectionFile <- file(file_ktlib_fastq,"r")
n<-0
pb <- txtProgressBar(0,nlines,style = 3)
while(n < nlines){
line <- readLines(connectionFile,n=1)
id <- substr(line,2,37)
if(id %in% seqsum_ktlib$read_id){
cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
line <- readLines(connectionFile,n=1)
cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
line <- readLines(connectionFile,n=1)
cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
line <- readLines(connectionFile,n=1)
cat(line,"\n",file = file_ktlib_q15_fastq, append=T, sep="")
}else{
line <- readLines(connectionFile,n=3)
}
n <- n+4
setTxtProgressBar(pb,n)
}
close(connectionFile)
Alignment using minimap2
minimap2 -ax map-ont 1_ReferenceFiles/KTB_digested.fasta 3_Analysis/KTLib/KTLib_1700_2100_Q15.fastq > 3_Analysis/KTLib/KTLib_1700_2100_Q15.sam
samtools view -S -b 3_Analysis/KTLib/KTLib_1700_2100_Q15.sam > 3_Analysis/KTLib/KTLib_1700_2100_Q15.bam #TRANSFORM TO BAM
samtools sort 3_Analysis/KTLib/KTLib_1700_2100_Q15.bam -o 3_Analysis/KTLib/KTLib_1700_2100_Q15.sorted.bam #SORT BAM FILE
SINGLe correction
tic <- proc.time()
pos_start <- 103
pos_end <- 1764
single_evaluate(bamfile = "3_Analysis/KTLib/KTLib_1700_2100_Q15.sorted.bam",
single_fits = single_train,
output_file = "3_Analysis/KTLib/KTLib_1700_2100_Q15_SINGLe.fastq",
refseq_fasta = reference_file,
pos_start=pos_start,pos_end=pos_end,
verbose=T,gaps_weights="minimum",save=TRUE,
save_original_scores=TRUE)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Barcodes detection. First create bash file.
tic <- proc.time()
## Inputs
bc_before <- "actggc"
bc_after <- "aagcc"
bc_before <- toupper(bc_before)
bc_after <- toupper(bc_after)
l <- nchar(bc_before)
length_bc <- 36
length_deltabc <- 2
inFile <- "KTLib_1700_2100_Q15.fastq"
out.file <- "KTLib_1700_2100_barcodes.txt"
code.file <- "3_Analysis/KTLib/FindBarcodes.sh"
bc_min <- length_bc-length_deltabc
bc_max <- length_bc+length_deltabc
## All possible variations of barcodes
bc_before_compl <- dna_reverse_complement_string(bc_before)
bc_after_compl <- dna_reverse_complement_string(bc_after)
bc_before_all <- bc_one_mutation(bc_before)
bc_before_compl_all <- bc_one_mutation(bc_before_compl)
bc_after_all <- bc_one_mutation(bc_after)
bc_after_compl_all <- bc_one_mutation(bc_after_compl)
bc_combinations <- rbind(data.frame(before=bc_before,after= bc_after, forw=TRUE),
data.frame(before=bc_before_all, after=bc_after, forw=TRUE),
data.frame(before=bc_before, after=bc_after_all, forw=TRUE),
data.frame(before=bc_after_compl, after=bc_before_compl, forw=FALSE),
data.frame(before=bc_after_compl, after=bc_before_compl_all, forw=FALSE),
data.frame(before=bc_after_compl_all, after=bc_before_compl, forw=FALSE))
## Main
if(file.exists(code.file)){file.remove(code.file)} ; file.create(code.file)
cat("#!/bin/bash\n\n",file=code.file)
cat("echo \"Name\tLineInFile\tBCsequence\tBCposition\tDirectionRead\" > ",out.file," \n", file=code.file, append=FALSE)
#2. Sequences with barcodes
for(i in 1:nrow(bc_combinations)){
bc_before <- as.character(bc_combinations$before[i])
bc_after <- as.character(bc_combinations$after[i])
forward <- bc_combinations$forw[i]
awk_in <- paste0("awk '")
awk_match <- paste0("NR%4==2 && match($0,/", bc_before, "([ACGT]{", bc_min,",",bc_max,"})", bc_after,"/)")
if(forward){
awk_print <- paste0(" {print a \"\t\" NR \"\t\" substr($0,RSTART+",l,",RLENGTH-",2*l,") \"\t\" RSTART+",l," \"\t+\" } {a=$0}' ",inFile, ">> ", out.file)
}else{
awk_print <- paste0(" {print a \"\t\" NR \"\t\" substr($0,RSTART+",l,",RLENGTH-",2*l,") \"\t\" RSTART+",l," \"\t-\" } {a=$0}' ",inFile, ">> ", out.file)
}
y_awk <- paste0(awk_in, awk_match, awk_print)
cat(y_awk,"\n", file=code.file, append=TRUE)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Then run bash file.
cd 3_Analysis/KTLib/
chmod +x FindBarcodes.sh
./FindBarcodes.sh
Filter barcodes that appear more than 3 times
tic <- proc.time()
barcodes_table <- read.table(paste0(path_analysis_lib,"KTLib_1700_2100_barcodes.txt"),header=T, sep="\t")
barcodes_counts <- barcodes_table %>% group_by(BCsequence) %>% summarise(counts=n()) %>% filter(counts>3)
barcodesID <- unique(barcodes_counts$BCsequence); n_bc <- length(barcodesID)
barcodes_table_fourcounts <- barcodes_table %>%
filter(BCsequence %in% barcodesID)%>%
mutate(readID=str_sub(Name, 2, 37)) ;
nrow_bc <- nrow(barcodes_table_fourcounts)
write.table(barcodes_table_fourcounts,file=paste0(path_analysis_lib,"KTLib_1700_2100_barcodes_4reads.txt"))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Load data
barcodes_4reads <- read.table(file=paste0(path_analysis_lib,"KTLib_1700_2100_barcodes_4reads.txt")) %>%
select(readID, BCsequence) %>% arrange(BCsequence)
barcodes_unique <- unique(barcodes_4reads$BCsequence)
Generate fasta and fastq files that we need
tic <- proc.time()
dir.create( paste0(path_analysis_lib, "fastq_per_barcode"))
dir.create( paste0(path_analysis_lib, "fasta_per_barcode"))
KTLib_Q15_seqs <- readQualityScaledDNAStringSet(paste0(path_analysis_lib,"KTLib_1700_2100_Q15.fastq"))
names(KTLib_Q15_seqs) <- str_sub(names(KTLib_Q15_seqs) , 1, 36)
for(s in 1:nrow(barcodes_4reads)){
seq_name <- barcodes_4reads$readID[s]
bc_name <- barcodes_4reads$BCsequence[s]
file_name_q <- paste0(path_analysis_lib, "fastq_per_barcode/", bc_name,".fastq")
file_name_a <- paste0(path_analysis_lib, "fasta_per_barcode/", bc_name,".fasta")
# which(names(KTLib_Q15_seqs)==seq_name)
writeQualityScaledXStringSet(KTLib_Q15_seqs[seq_name], filepath = file_name_q, append = T)
writeXStringSet(KTLib_Q15_seqs[seq_name], filepath = file_name_a, append = T)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Load corrected sequences
tic <- proc.time()
seqs_single_ktlib <- readQualityScaledDNAStringSet(paste0(path_analysis_lib,"/KTLib_1700_2100_Q15_SINGLe.fastq"))
aux_seqs <- vapply(as.character(seqs_single_ktlib),strsplit, split="")
aux_quals <- 1-as(quality(seqs_single_ktlib),"NumericList")
aux_pos <- lapply(aux_seqs, seq)
aux_names <- c(vapply(names(seqs_single_ktlib),rep, each=1662))
data_single_ktlib <- data.frame(nucleotide = unlist(aux_seqs),
p_SINGLe=unlist(aux_quals),
position=unlist(aux_pos),
readID=aux_names)
rownames(data_single_ktlib)<-NULL
seqs_guppy_ktlib <- readQualityScaledDNAStringSet(paste0(path_analysis_lib,"/KTLib_1700_2100_Q15_SINGLe_original.fastq"))
aux_seqs <- vapply(as.character(seqs_single_ktlib),strsplit, split="")
aux_quals <- 1-as(quality(seqs_single_ktlib),"NumericList")
aux_pos <- lapply(aux_seqs, seq)
aux_names <- c(vapply(names(seqs_single_ktlib),rep, each=1662))
data_guppy_ktlib <- data.frame(nucleotide = unlist(aux_seqs),
p_Guppy=unlist(aux_quals),
position=unlist(aux_pos),
readID=aux_names)
rownames(data_guppy_ktlib)<-NULL
data_single_ktlib <- data_single_ktlib %>%
left_join(data_guppy_ktlib, by = c("nucleotide", "position", "readID"))%>%
left_join(barcodes_4reads, by = "readID")%>%
filter(!is.na(BCsequence)) %>%
mutate(isWT = nucleotide==wildtype[position]) %>%
mutate(p_SINGLe=ifelse(is.na(p_SINGLe), 1, p_SINGLe))%>%
mutate(p_noweights=1)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
barcodes_unique <- unique(data_single_ktlib$BCsequence)
n_bc <- length(barcodes_unique)
# save.image(file='myEnvironment.RData')
# load('myEnvironment.RData')
By SINGLe (sorting homopolymers)
tic <- proc.time()
#Compute consensus for each barcode using all sequences available and using the three available methods
cat("Barcode;Nreads;mutations_by_single;mutations_by_nanopore;mutations_no_weights\n", file=file_out_consensus, append=F)
pb <- txtProgressBar(0,n_bc,style=3)
counter<-0
wt_homopolymers <- detect_homopolymer_positions(wildtype)
wt_homopolymers <- wt_homopolymers[vapply(wt_homopolymers,length)>1]
pos_wt_homopolymers <- unlist(wt_homopolymers)
for( bc in barcodes_unique){
bc_reads <- data_single_ktlib %>%
filter(BCsequence==bc)
index_homopolymers <- which(bc_reads$position %in% pos_wt_homopolymers & bc_reads$nucleotide=="-")
n_memory <- c()
for(n in index_homopolymers){
if(n %in% n_memory){next()}
hp_pos <- bc_reads$position[n]
hp_pos_ref <- wt_homopolymers[vapply(wt_homopolymers, function(x){hp_pos %in% x})][[1]]
hp_vec_logical <- hp_pos_ref==hp_pos
hp_vec_length <- length(hp_vec_logical)
hp_vec <- rep(NA,hp_vec_length)
ind_aux <- which(hp_vec_logical)
hp_vec[1:ind_aux] <- rev(n+1-1:ind_aux)
if(ind_aux<hp_vec_length){
hp_vec[(ind_aux+1):hp_vec_length] <- n+c(1:(hp_vec_length-ind_aux))
}
n_memory <- hp_vec
data_aux <- bc_reads[hp_vec,]
if(all(data_aux$nucleotide=="-")){next()}
pos <- data_aux$position
nucl <- data_aux$nucleotide
ind_order <- c(which(nucl!="-"),which(nucl=="-"))
bc_reads$position[hp_vec] <- pos[order(ind_order)]
if(length( which(bc_reads$position %in% pos_wt_homopolymers & bc_reads$nucleotide=="-"))<length(index_homopolymers)){
cat("n=",n,"\n")
stop()
}
}
bc_reads <- bc_reads %>% arrange(readID,position)
cons_single <- weighted_consensus(df = bc_reads %>%
select(nucleotide,p_SINGLe,position),
cutoff_prob = 0)
cons_nanopore <- weighted_consensus(df = bc_reads %>%
select(nucleotide,p_Guppy,position),
cutoff_prob = 0)
cons_noweight <- weighted_consensus(df = bc_reads %>%
select(nucleotide,p_noweights,position),
cutoff_prob = 0)
muts_single <- c(detect_mutations(biostr_to_char(cons_single), wildtype))
muts_nanopore <- c(detect_mutations( biostr_to_char(cons_nanopore), wildtype))
muts_noweight <- c(detect_mutations( biostr_to_char(cons_noweight), wildtype))
n_single <- length(muts_single)
n_nanopore <- length(muts_nanopore)
n_noweight <- length(muts_noweight)
if(n_single==0){n_single=1;muts_single="wildtype"}
if(n_nanopore==0){n_nanopore=1;muts_nanopore="wildtype"}
if(n_noweight==0){n_noweight=1;muts_noweight="wildtype"}
n_tot = n_single+n_nanopore+n_noweight
df <- data.frame(barcode = rep(bc,n_tot),nseqs=length(unique(bc_reads$readID)),
mutation=c(muts_single,muts_nanopore,muts_noweight),
method = c(rep("single", n_single),
rep("nanopore",n_nanopore),
rep("no_weights",n_noweight)))
write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus)
counter <- counter+1
setTxtProgressBar(pb,counter)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
# save.image(file='myEnvironment.RData')
Prepare fast5 files for Nanopolish. Please indicate proper paths indicated in capital letters
tic <- proc.time()
dir.create(paste0(path_analysis_lib,"/fast5files/") )# individual fast5 files will be stored there temporally
split_fast5 <- paste0("multi_to_single_fast5 -i 2_NanoporeRawReads/LargeSet_fast5 -s 3_Analysis/KTLib/fast5files/") #PATH_TO_FAST5_FILES location of nanopore reads
system2(split_fast5)
# Place them in folders according to barcode
single_files <- dir(paste0(path_analysis_lib,"/fast5files/"), full.names = T, recursive = T)
single_files_name <- dir(paste0(path_analysis_lib,"/fast5files/"), full.names = F, recursive = T)
single_files <- data.frame(file=single_files, readID=single_files_name) %>%
mutate(readID = sub(readID, pattern=".fast5",replacement="")) %>%
mutate(readID = sub(readID, pattern="[[:digit:]]+/", replacement=""))
barcodes_table_fourcounts <- barcodes_table %>% left_join(single_files, by="readID")
nrow_bc <- nrow(barcodes_table_fourcounts)
pb <- txtProgressBar(min = 0, max = nrow_bc, style = 3)
for(i in 1:nrow_bc){
setTxtProgressBar(pb,i)
seqid <- barcodes_table_fourcounts$readID[i]
barcode <- barcodes_table_fourcounts$BCsequence[i]
barcode_dir <- paste0(path_analysis_lib,"fast5files/",barcode)
file <- barcodes_table_fourcounts$file[i]
if(!dir.exists(barcode_dir)){ dir.create(barcode_dir)}
command <- paste0("cp ", file, " ",barcode_dir)
system2(command)
}
# Create unique fast5 file per barcode
pb <- txtProgressBar(min = 0, max = nrow_bc, style = 3)
for(bc in barcodesID){
setTxtProgressBar(pb,bc)
command <- paste0("single_to_multi_fast5 -i 3_Analysis/KTLib/fast5files/",bc," -s 3_Analysis/KTLib/fast5files_byBC/ -f ",bc)
system2(command)
}
barcodes_table_fourcounts <- barcodes_table_fourcounts %>% select(-file)
rm(single_files,path_fast5,path_fast5_individual,path_fast5_individual_out)
### Create fastq file for interesting barcodes
dir.create(paste0(path_analysis_lib,"/fastq/"))
fastq_file <- readLines(paste0(path_analysis_lib,"/KTLib_1700_2100_Q15.fastq"))
n_fastq <- length(fastq_file)
for(i in seq(1,n_fastq, by=4)){
ind <- grep(pattern = substr(fastq_file[i],2,37),x=barcodes_table_fourcounts$SeqID)
if(length(ind)==0){next(i,"not found \n")}
write(fastq_file[c(i,i+1,i+2,i+3)],
file=paste0(path_analysis_lib,"/fastq/",barcodes_table_fourcounts$BCsequence[ind],".fastq"),
append=T)
}
### Create fasta from fastq
fastq_files <- dir(paste0(path_analysis_lib,"/fastq/"), full.names = T, pattern="*.fastq")
fasta_files <- str_replace_all(fastq_files, "fastq", "fasta")
dir.create(paste0(path_analysis_lib,"/fasta/"))
commands <- paste0("seqtk seq -a ", fastq_files, " > ",fasta_files)
vapply(commands, system2)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Compute using nanopolish and medaka. first generate bash file
# Load data
tic <- proc.time()
#outputs
bash_file <- "ConsensusByMethods.sh"
results_medaka <- paste0(path_analysis_lib,"ConsensusbyMedaka.txt")
nanopolish_results <- paste0(path_analysis_lib,"Consensus_by_Nanopolish.txt")
for(bc in barcodes_unique){
#all filenames
filename <- bc
fasta <- paste0(path_analysis_lib,"fasta/",bc,".fasta")
fastq <- paste0(path_analysis_lib,"fastq/",bc,".fastq")
sam <- paste0(path_analysis_lib,bc,".sam")
sorted <- paste0(path_analysis_lib,bc,".sorted.fasta")
racon <- paste0(path_analysis_lib,bc,".racon.fasta")
medaka <- paste0(path_analysis_lib,bc,"_medaka.fasta")
nanopolish <- paste0(path_analysis_lib,bc,"_nanopolish.txt")
## Run minimap2
line_minimap <- paste0("minimap2 -ax map-ont ", reference_file, " ", fastq," > ", sam)
## Run racon
line_racon <- paste0("racon ",fastq," ",sam, " ",reference_file, " >> ", racon)
line_remove_racon <- paste0("rm ", racon,".fai ",racon, ".mmi ",racon )
## Run Medaka
line_medaka <- paste0("medaka_consensus -i ",fastq," -d ",racon," -o ", medaka," -t 4 ")
line_results_medaka <- paste0("echo '>",filename, "' >> ",results_medaka)
line_results_medaka2 <- paste0("tail -n 1 ",filename,"_medaka.fasta/consensus.fasta", " >> ",results_medaka)
line_remove_medaka <- paste0("rm -r ",filename,"_medaka.fasta/")
## Run nanopolish
line_nanopolish_index <- paste0("nanopolish index -d ",bc,"/ ",fastq)
line_nanopolish_sort <- paste0("samtools sort ",sam," -o ",sorted," -T temporal.tmp ")
line_nanopolish_index2 <- paste0("samtools index ", sorted)
line_nanopolish_variants <- paste0("nanopolish variants --consensus -o ", nanopolish, " -r ",fastq," -b ",sorted," -g ", reference_file)
line_results_nanopolish <- paste0("awk '!/^#/ {print \"",filename,"\\t\" $0}' ",nanopolish, " >> ",nanopolish_results)
line_remove_nanopolish <- paste0("rm ", fastq,".index* ", filename,".sorted* ", nanopolish, " ", sam)
cat(line_minimap,
line_racon,
line_medaka,
line_results_medaka,
line_results_medaka2,
line_nanopolish_index,
line_nanopolish_sort,
line_nanopolish_index2,
line_nanopolish_variants,
line_results_nanopolish,
line_remove_racon,
line_remove_medaka,
line_remove_nanopolish,
file=bash_file, sep="\n", append=T)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Then run it
chmod +x ConsensusByMethods.sh
./ConsensusByMethods.sh
Align consensus sequences to reference
minimap2 -ava-ont 1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyMedaka.fasta > ConsensusbyMedaka_aligned.sam
minimap2 -ava-ont 1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyRacon.fasta > ConsensusbyRacon_aligned.sam
Parse results into a table consensus_mismatches
tic <- proc.time()
pos_initial <- 1
pos_final <- length(wildtype)
cigar_reference_counts <- c("M","D","N","=","X")
cigar_query_counts <- c("M","I","S","=","X")
sam_data_medaka <- read.table(paste0(path_analysis_lib,"ConsensusbyMedaka_aligned.sam"), skip=2)
medaka_mismatches <- data.frame(Barcode=NA,Position=NA,base.original=NA,base.mutated=NA)
for(i in 1:nrow(sam_data_medaka)){
line_data <- unlist(sam_data_medaka[i,])
pos_ini_aliref <- as.numeric(line_data[4])
cigar <- line_data[6]
sequence <- strsplit(line_data[10],split="")[[1]]
#Obtain CIGAR vector
cigar.table <- data.frame(n=as.numeric(strsplit(cigar, split="[MIDNSHP=X]")[[1]]),Op= strsplit(cigar, split="[0-9]+")[[1]][-1])
cigar_vec <- unlist(apply(cigar.table,1, function(x){rep(x[2],x[1])}))
names(cigar_vec) <- NULL
if(pos_ini_aliref>1){cigar_vec <- c(rep("D",pos_ini_aliref-1),cigar_vec)}
#Data frame for the original sequence, and reference for the model (full)
mismatches <- data.frame(Barcode =rep(line_data[1], length(cigar_vec)),
cigar=cigar_vec,
base.mutated=NA,pos_in_samalig=NA,Position=NA,base.original=NA)
rows_with_query <- mismatches$cigar%in%cigar_query_counts
rows_with_reference <- mismatches$cigar%in%cigar_reference_counts
mismatches$base.mutated[rows_with_query] <- sequence #Nucleotide
mismatches$base.mutated[is.na(mismatches$base.mutated)] <- "-"
mismatches$pos_in_samalig[rows_with_reference] <- pos_ini_aliref + c(0:(sum(rows_with_reference)-1)) #position - according to aligning reference
mismatches$Position[rows_with_reference] <- 1:length(wildtype) #position - according to single reference
mismatches$base.original[rows_with_reference] <- wildtype
index <- which(mismatches$base.mutated!=mismatches$base.original)
mismatches <- mismatches[index,c("Barcode", "Position","base.original","base.mutated")]
medaka_mismatches <- medaka_mismatches %>% rbind(mismatches)
}
medaka_mismatches <- medaka_mismatches %>% filter(!is.na(Barcode)) %>% mutate(method="Medaka",mutation=paste0(base.original,Position,base.mutated))
sam_data_racon <- read.table("ConsensusbyRacon_aligned.sam", skip=2)
racon_mismatches <- data.frame(Barcode=NA,Position=NA,base.original=NA,base.mutated=NA)
for(i in 1:nrow(sam_data_racon)){
line_data <- unlist(sam_data_racon[i,])
pos_ini_aliref <- as.numeric(line_data[4])
cigar <- line_data[6]
sequence <- strsplit(line_data[10],split="")[[1]]
#Obtain CIGAR vector
cigar.table <- data.frame(n=as.numeric(strsplit(cigar, split="[MIDNSHP=X]")[[1]]),Op= strsplit(cigar, split="[0-9]+")[[1]][-1])
cigar_vec <- unlist(apply(cigar.table,1, function(x){rep(x[2],x[1])}))
names(cigar_vec) <- NULL
if(pos_ini_aliref>1){cigar_vec <- c(rep("D", pos_ini_aliref-1), cigar_vec)}
#Data frame for the original sequence, and reference for the model (full)
mismatches <- data.frame(Barcode =rep(line_data[1], length(cigar_vec)),
cigar=cigar_vec,
base.mutated=NA,pos_in_samalig=NA,Position=NA,base.original=NA)
rows_with_query <- mismatches$cigar%in%cigar_query_counts
rows_with_reference <- mismatches$cigar%in%cigar_reference_counts
mismatches$base.mutated[rows_with_query] <- sequence #Nucleotide
mismatches$base.mutated[is.na(mismatches$base.mutated)] <- "-"
mismatches$pos_in_samalig[rows_with_reference] <- pos_ini_aliref + c(0:(sum(rows_with_reference)-1)) #position - according to aligning reference
mismatches$Position[rows_with_reference] <- 1:length(wildtype) #position - according to single reference
mismatches$base.original[rows_with_reference] <- wildtype
index <- which(mismatches$base.mutated!=mismatches$base.original)
mismatches <- mismatches[index,c("Barcode", "Position","base.original","base.mutated")]
racon_mismatches <- racon_mismatches %>% rbind(mismatches)
}
racon_mismatches <- racon_mismatches %>% filter(!is.na(Barcode)) %>% mutate(method="Racon", mutation=paste0(base.original,Position,base.mutated))
results_nanopolish <- readLines("ConsensusbyNanopolish.txt")
results_nanopolish <- do.call(rbind,vapply(results_nanopolish, strsplit, split="\t"))
colnames(results_nanopolish) <- rownames(results_nanopolish) <- NULL
results_nanopolish <- results_nanopolish[,c(1,3,5,6)]
colnames(results_nanopolish) <- c("Barcode","Position","base.original","base.mutated")
results_nanopolish <- as.data.frame(results_nanopolish)
results_nanopolish$Position <- as.numeric(results_nanopolish$Position)
results_nanopolish <- results_nanopolish %>% filter(!is.na(Barcode))
#remove insertions
ind_insertions <- nchar(results_nanopolish$base.mutated)>1
results_nanopolish$base.mutated[ind_insertions] <- vapply(results_nanopolish$base.mutated[ind_insertions] , substr,1,1)
results_nanopolish <- results_nanopolish %>% filter(base.mutated!=base.original)
#Split deletions
index_aux <- which(nchar(results_nanopolish$base.original)==2)
results_nanopolish$base.original[index_aux] <- substr(results_nanopolish$base.original[index_aux],2,2)
results_nanopolish$base.mutated[index_aux] <- "-"
results_nanopolish$Position[index_aux] <- as.numeric(results_nanopolish$Position[index_aux])+1
index_aux_3 <- which(nchar(results_nanopolish$base.original)==3)
aux_df_1 <- results_nanopolish[index_aux_3,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"
aux_df_2 <- results_nanopolish[index_aux_3,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"
results_nanopolish <- results_nanopolish[-index_aux_3,]
results_nanopolish <- results_nanopolish %>% rbind(aux_df_1) %>% rbind(aux_df_2)
rm(aux_df_1,aux_df_2)
index_aux_4 <- which(nchar(results_nanopolish$base.original)==4)
aux_df_1 <- results_nanopolish[index_aux_4,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"
aux_df_2 <- results_nanopolish[index_aux_4,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"
aux_df_3 <- results_nanopolish[index_aux_4,]
aux_df_3$Position <- as.numeric(aux_df_3$Position)+3
aux_df_3$base.original <- substr(aux_df_3$base.original, 4,4)
aux_df_3$base.mutated <- "-"
results_nanopolish <- results_nanopolish[-index_aux_3,]
results_nanopolish <- results_nanopolish %>% rbind(aux_df_1) %>% rbind(aux_df_2)%>% rbind(aux_df_3) %>% arrange(Barcode,Position) %>% mutate(method="Nanopolish", mutation=paste0(base.original,Position,base.mutated))
save(results_nanopolish, medaka_mismatches, racon_mismatches, file=paste0(path_consensus,"Mismatches_by_Method.Rdata"))
single_mismatches <- read.table("ConsensusBySINGLE.txt", header=F,skip=1)
colnames(single_mismatches) <- colnames(single_mismatches_sortHPinVCS) <- c("Barcode","Nreads","mutation","method")
single_mismatches <- single_mismatches %>%
mutate(base.original = stringr::str_sub(mutation,1,1)) %>%
mutate(base.mutated = stringr::str_sub(mutation,-1,-1)) %>%
mutate(Position = stringr::str_sub(mutation,2,-2)) %>%
select(Barcode,base.original,base.mutated,Position,method,mutation)
KTLib_mutations <- rbind(single_mismatches,single_mismatches_sortHPinVCS, results_nanopolish, medaka_mismatches,racon_mismatches)
write.table(KTLib_mutations, file=file_consensus_KTLib)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Identify ten most frequent barcodes
tic <- proc.time()
KTLib_topBC <- barcodes_4reads %>%
group_by(BCsequence) %>%
summarise(n=n()) %>%
arrange(-n)%>%
head(n=10)
KTLib_topBC_reads<- barcodes_4reads %>%
filter(BCsequence %in% KTLib_topBC$BCsequence)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Create a fastq file for each of them
tic <- proc.time()
for(i in unique(KTLib_topBC$BCsequence)){
reads_bc_i <- KTLib_topBC_reads %>% filter(BCsequence==i)
sequences_bc_i <- seqs_single_ktlib [names(seqs_single_ktlib) %in% reads_bc_i$readID]
writeQualityScaledXStringSet(sequences_bc_i, file=paste0(path_analysis_lib, i, ".fastq"))
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Compute consensus on subsets of reads by SINGLE
tic <- proc.time()
#Load corrected data
data_single_ktlib_topten <- data_single_ktlib %>%
filter(Name %in% KTLib_topBC_reads$Name )
# # Load data
# reads_corrected <- read.table("3_PreprocessedData/SINGLed_data/LargeSet_F_corrected.txt", header=T)
# reads_corrected <- reads_corrected %>% filter(SeqID %in% KTLib_BC_reads$SeqID )
# reads_corrected_rev <- read.table("3_PreprocessedData/SINGLed_data/LargeSet_R_corrected.txt", header=T)
# reads_corrected_rev <- reads_corrected_rev %>% filter(SeqID %in% KTLib_BC_reads$SeqID )
# reads_corrected_all <- rbind(reads_corrected,reads_corrected_rev)
#
# reads_corrected <- left_join(reads_corrected_all, KTLib_topBC_reads %>% select(SeqID,BCsequence) , by="SeqID")
# reads_corrected <- reads_corrected %>% filter(!is.na(reads_corrected$BCsequence))
# reads_corrected$p_right_priors_model[is.na(reads_corrected$p_right_priors_model)]<-1
#Compute consensus for each barcode using all sequences available and using the three available methods
cat("Barcode\tNreads\trepetition\tmutation\tmethod\n", file=file_out_consensus_topten, append=F)
barcodes_unique <- unique(data_single_ktlib_topten$BCsequence)
n_bc <- length(barcodes_unique)
for(bc in barcodes_unique){
cat(bc, "\n")
bc_reads <- data_single_ktlib_topten %>%
filter(BCsequence==bc)
#compute consensus on subsets
bc_seqsID <- unique(bc_reads$readID)
for( s in subsets){
cat(s, "\t")
counter<- 0
pb <- txtProgressBar(0,n_repetitions,style=3)
for (r in 1:n_repetitions) {
counter <- counter+1
setTxtProgressBar(pb,counter)
subset_bc_reads <- bc_reads %>%
filter(readID %in% sample(bc_seqsID, s))
#sort homopolymers
index_homopolymers <- which(subset_bc_reads$position %in% pos_wt_homopolymers
& subset_bc_reads$nucleotide=="-")
n_memory <- c()
for(n in index_homopolymers){
if(n %in% n_memory){next()}
hp_pos <- subset_bc_reads$position[n]
hp_pos_ref <- wt_homopolymers[vapply(wt_homopolymers, function(x){hp_pos %in% x})][[1]]
hp_vec_logical <- hp_pos_ref==hp_pos
hp_vec_length <- length(hp_vec_logical)
hp_vec <- rep(NA,hp_vec_length)
ind_aux <- which(hp_vec_logical)
hp_vec[1:ind_aux] <- rev(n+1-1:ind_aux)
if(ind_aux<hp_vec_length){
hp_vec[(ind_aux+1):hp_vec_length] <- n+c(1:(hp_vec_length-ind_aux))
}
n_memory <- hp_vec
data_aux <- subset_bc_reads[hp_vec,]
if(all(data_aux$nucleotide=="-")){next()}
pos <- data_aux$position
nucl <- data_aux$nucleotide
ind_order <- c(which(nucl!="-"),which(nucl=="-"))
subset_bc_reads$position[hp_vec] <- pos[order(ind_order)]
if(length( which(subset_bc_reads$position %in% pos_wt_homopolymers & subset_bc_reads$nucleotide=="-"))<length(index_homopolymers)){
cat("n=",n,"\n")
stop()
}
}
subset_bc_reads <- subset_bc_reads %>% arrange(readID,position)
cons_single <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_SINGLe,position),
cutoff_prob = 0)
cons_guppy <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_Guppy,position),
cutoff_prob = 0)
cons_noweight <- weighted_consensus(df = subset_bc_reads %>%
select(nucleotide,p_noweights,position),
cutoff_prob = 0)
muts_single <- detect_mutations(biostr_to_char(cons_single), wildtype)
muts_guppy <- detect_mutations(biostr_to_char(cons_guppy), wildtype)
muts_noweight <- detect_mutations(biostr_to_char(cons_noweight), wildtype)
n_single <- length(muts_single)
n_guppy <- length(muts_guppy)
n_noweight <- length(muts_noweight)
n_tot <- n_single+n_guppy+n_noweight
if(n_tot==0){next()}
df <- data.frame(barcode = rep(bc,n_tot),nseqs=s,repetition=r,
mutation=c(muts_single,muts_guppy,muts_noweight),
method = c(rep("SINGLe", n_single),
rep("Guppy",n_guppy),
rep("No weights",n_noweight)))
write.table(df, append = T, col.names = F,row.names = F, file = file_out_consensus_topten)
}
}
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Compute consensus on subsets of reads by Nanopolish and Medaka
tic <- proc.time()
dir.create(paste0(path_analysis_lib,"/subsets/"))
file.copy(reference_file, paste0(path_analysis_lib,"/subsets/"))
## Make fastq files with subsets
for(bc in unique(KTLib_topBC_reads$BCsequence)){
fastq_file <- readLines(paste0(path_analysis_lib, bc,".fastq"))
name_lines <- grep(pattern="^@",x=fastq_file)
for(ns in subsets){
if(ns> length(name_lines)){next()}
for(r in 1:n_repetitions){
fastq_subset_out <- paste0(path_analysis_lib,"subsets/",bc,"_",ns,"_",r,".fastq")
## Make file of subset of sequences
choose_lines <- sample(name_lines, size= ns)
choose_lines_all <- sort(c(choose_lines,choose_lines+1,choose_lines+2,choose_lines+3))
writeLines(text = fastq_file[choose_lines_all], con = file(fastq_subset_out))
}
}
}
in_files <-
dir(pattern=".fastq", path=paste0(path_analysis_lib,"/subsets/"))
## .sh for minimap2
for(i in seq_along(in_files)){
fastq_file <- in_files[i]
sam_file <- sub(pattern=".fastq", replacement = ".sam", x=fastq_file)
line_minimap <- paste0("minimap2 -ax map-ont KTrefORF_1662.fasta ",fastq_file," > ", sam_file)
cat(line_minimap,file=paste0(path_analysis_lib,"/subsets/Minimap_allBC.sh"), sep="\n", append=i!=1)
}
# .sh for racon
results_racon <- "../ConsensusbyRacon_subsets.txt"
for(i in seq_along(in_files)){
fastq_file <- in_files[i]
sam_file <- sub(pattern=".fastq", replacement = ".sam", x=fastq_file)
racon <- sub(pattern=".fastq", replacement = ".racon.fasta", x=fastq_file)
filename <- sub(pattern=".fastq", replacement = "", x=fastq_file)
## Run racon
line_racon <- paste0("racon ",fastq_file," ",sam_file, " KTrefORF_1662.fasta >> ", racon)
line_results_racon <- paste0("echo '>",filename, "' >> ",results_racon)
line_results_racon2 <- paste0("tail -n 1 ",racon, " >> ",results_racon)
cat(line_racon,line_results_racon,line_results_racon2,file=paste0(path_analysis_lib,"/subsets/Racon_allBC.sh"), sep="\n", append= i!=1)
}
# .sh for Medaka
for(i in seq_along(in_files)){
fastq_file <- in_files[i]
racon <- sub(pattern=".fastq", replacement = ".racon.fasta", x=fastq_file)
filename <- sub(pattern=".fastq", replacement = "", x=fastq_file)
medaka <- sub(pattern=".fastq", replacement = "_medaka.fasta", x=fastq_file)
results_medaka <- paste0("Consensus_",filename,".fasta")
## Run Medaka
line_medaka <- paste0("medaka_consensus -i ",fastq_file," -d ",racon," -o ", medaka," -t 4 ")
line_results_medaka <- paste0("echo '>",filename, "' >> ",results_medaka)
line_results_medaka2 <- paste0("tail -n 1 ",filename,"_medaka.fasta/consensus.fasta", " >> ",results_medaka)
line_remove_medaka <- paste0("rm -r ",medaka)
cat(line_medaka,line_results_medaka,line_results_medaka2,line_remove_medaka,file=paste0(path_analysis_lib,"/subsets/Medaka_",filename,".sh"), sep="\n", append=F)
}
# .sh for nanopolish
for(i in seq_along(in_files)){
fastq_file <- in_files[i]
sam_file <- sub(pattern=".fastq", replacement = ".sam", x=fastq_file)
nanopolish_file <- sub(pattern=".fastq", replacement = "_nanopolish.txt", x=fastq_file)
filename <- sub(pattern=".fastq", replacement = "", x=fastq_file)
nanopolish_results <- paste0(filename,"_ConsensusbyNanopolish_subsets.txt")
bc <- strsplit(filename, split="_")[[1]][1]
fast5_dir <- paste0("../fast5/individualfiles/",bc)
## lines nanopolish
line_nanopolish_index <- paste0("nanopolish index -d ../fast5/ subsets/",fastq_file)
line_nanpolish_sort <- paste0("samtools sort subsets/",sam_file," -o ",filename,".sorted.fasta -T temporal.tmp ")
line_nanopolish_index2 <- paste0("samtools index ", filename,".sorted.fasta")
line_nanopolish_variants <- paste0("nanopolish variants --consensus -o ", nanopolish_file, " -r BCtopten/",fastq_file," -b ",filename,".sorted.fasta -g ", reference_file)
line_results_nanopolish <- paste0("awk '!/^#/ {print \"",filename,"\\t\" $0}' ",nanopolish_file, " >> ",nanopolish_results)
line_remove_nanopolish <- paste0("rm ", filename,"_nanopolish.txt ", filename,".sorted* ")
cat("cd ../ \n",
line_nanopolish_index,
line_nanpolish_sort,
line_nanopolish_index2,
line_nanopolish_variants,
line_results_nanopolish,
line_remove_nanopolish,
file=paste0("subsets/Nanopolish_",filename,".sh"), sep="\n", append=F)
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
cd 3_Analysis/KTLib/subsets
chmod +x Minimap_allBC.sh
./Minimap_allBC.sh
chmod +x Racon_allBC.sh
./Racon_allBC.sh
chmod +x Medaka*
.Medaka*
chmod +x Nanopolish*
./Nanopolish*
cat *_ConsensusbyRacon.txt > ConsensusbyRacon_subsets.txt
cat *_ConsensusbyMedaka.txt > ConsensusbyMedaka_subsets.txt
cat *_ConsensusbyNanopolish.txt > ConsensusbyNanopolish_subsets.txt
minimap2 -ava-ont ../../1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyRacon_subsets.fasta > ConsensusbyRacon_subsets_aligned.sam
minimap2 -ava-ont ../../1_ReferenceFiles/KTrefORF_1662.fasta ConsensusbyMedaka_subsets.fasta > ConsensusbyMedaka_subsets_aligned.sam
Identify mutations and save table
file_consensus_top10_inLIB <- paste0(path_analysis_lib,"Convergence_10top_mismatches.txt" )
tic <- proc.time()
pos_initial <- 1
pos_final <- length(wildtype)
cigar_reference_counts <- c("M","D","N","=","X")
cigar_query_counts <- c("M","I","S","=","X")
sam_data_medaka <- read.table("subsets/ConsensusbyMedaka_subsets_aligned.sam", skip=2)
medaka_mismatches_subsets <- data.frame(Barcode=NA,Position=NA,base.original=NA,base.mutated=NA)
for(i in 1:nrow(sam_data_medaka)){
line_data <- unlist(sam_data_medaka[i,])
pos_ini_aliref <- as.numeric(line_data[4])
cigar <- line_data[6]
sequence <- strsplit(line_data[10],split="")[[1]]
#Obtain CIGAR vector
cigar.table <- data.frame(n=as.numeric(strsplit(cigar, split="[MIDNSHP=X]")[[1]]),Op= strsplit(cigar, split="[0-9]+")[[1]][-1])
cigar_vec <- unlist(apply(cigar.table,1, function(x){rep(x[2],x[1])}))
names(cigar_vec) <- NULL
if(pos_ini_aliref>1){cigar_vec <- c(rep("D",pos_ini_aliref-1),cigar_vec)}
#Data frame for the original sequence, and reference for the model (full)
mismatches <- data.frame(Barcode =rep(line_data[1], length(cigar_vec)),
cigar=cigar_vec,
base.mutated=NA,pos_in_samalig=NA,Position=NA,base.original=NA)
rows_with_query <- mismatches$cigar%in%cigar_query_counts
rows_with_reference <- mismatches$cigar%in%cigar_reference_counts
mismatches$base.mutated[rows_with_query] <- sequence #Nucleotide
mismatches$base.mutated[is.na(mismatches$base.mutated)] <- "-"
mismatches$pos_in_samalig[rows_with_reference] <- pos_ini_aliref + c(0:(sum(rows_with_reference)-1)) #position - according to aligning reference
mismatches$Position[rows_with_reference] <- 1:length(wildtype) #position - according to single reference
mismatches$base.original[rows_with_reference] <- wildtype
index <- which(mismatches$base.mutated!=mismatches$base.original)
mismatches <- mismatches[index,c("Barcode", "Position","base.original","base.mutated")]
medaka_mismatches_subsets <- medaka_mismatches_subsets %>% rbind(mismatches)
}
medaka_mismatches_subsets <- medaka_mismatches_subsets %>% filter(!is.na(Barcode))
medaka_aux <- do.call(rbind,vapply(medaka_mismatches_subsets[,1],strsplit,split="_"))
medaka_mismatches_subsets <- medaka_mismatches_subsets %>% mutate(Barcode = medaka_aux[,1], Nreads=as.numeric(medaka_aux[,2]), repetition=as.numeric(medaka_aux[,3]))
medaka_mismatches_subsets <- medaka_mismatches_subsets %>% mutate(Method="Medaka", homopolymers=NA, mutation=paste0(base.original,Position,base.mutated)) %>% dplyr::rename(repetition=repetitions)
nanopolish_mismatches_subsets = data.frame(barcode=NA,nseqs=NA,repetition=NA,position=NA,base.original=NA,base.mutated=NA)
results_nanopolish <- system2(paste0("awk \'/^[^#]/ \' BCtopten/nanopolish_consensus/*"), intern = T)
results_nanopolish <- do.call(rbind,vapply(results_nanopolish, strsplit, split="\t"))
results_nanopolish <- results_nanopolish[,c(1,3,5,6)]
nanopolish_aux <- do.call(rbind,vapply(results_nanopolish[,1],strsplit,split="_"))
results_nanopolish <- cbind(nanopolish_aux,results_nanopolish[,-1])
colnames(results_nanopolish) <- c("Barcode","Nreads","repetition", "Position","base.original","base.mutated" )
rownames(results_nanopolish) <- NULL
nanopolish_mismatches_subsets <- data.frame(Barcode = results_nanopolish[,1],
Nreads=as.numeric(results_nanopolish[,2]),
repetition = as.numeric(results_nanopolish[,3]),
Position = as.numeric(results_nanopolish[,4]),
base.original=results_nanopolish[,5],
base.mutated = results_nanopolish[,6])
#remove insertions
ind_insertions <- nchar(nanopolish_mismatches_subsets$base.mutated)>1
nanopolish_mismatches_subsets$base.mutated[ind_insertions] <- vapply(nanopolish_mismatches_subsets$base.mutated[ind_insertions] , substr,1,1)
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets %>% filter(base.mutated!=base.original)
#Split deletions
index_aux <- which(nchar(nanopolish_mismatches_subsets$base.original)==2)
nanopolish_mismatches_subsets$base.original[index_aux] <- substr(nanopolish_mismatches_subsets$base.original[index_aux],2,2)
nanopolish_mismatches_subsets$base.mutated[index_aux] <- "-"
nanopolish_mismatches_subsets$Position[index_aux] <- as.numeric(nanopolish_mismatches_subsets$Position[index_aux])+1
index_aux_3 <- which(nchar(nanopolish_mismatches_subsets$base.original)==3)
aux_df_1 <- nanopolish_mismatches_subsets[index_aux_3,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"
aux_df_2 <- nanopolish_mismatches_subsets[index_aux_3,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets[-index_aux_3,]
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets %>% rbind(aux_df_1) %>% rbind(aux_df_2)
rm(aux_df_1,aux_df_2)
index_aux_4 <- which(nchar(nanopolish_mismatches_subsets$base.original)==4)
aux_df_1 <- nanopolish_mismatches_subsets[index_aux_4,]
aux_df_1$Position <- as.numeric(aux_df_1$Position)+1
aux_df_1$base.original <- substr(aux_df_1$base.original, 2,2)
aux_df_1$base.mutated <- "-"
aux_df_2 <- nanopolish_mismatches_subsets[index_aux_4,]
aux_df_2$Position <- as.numeric(aux_df_2$Position)+2
aux_df_2$base.original <- substr(aux_df_2$base.original, 3,3)
aux_df_2$base.mutated <- "-"
aux_df_3 <- nanopolish_mismatches_subsets[index_aux_4,]
aux_df_3$Position <- as.numeric(aux_df_3$Position)+3
aux_df_3$base.original <- substr(aux_df_3$base.original, 4,4)
aux_df_3$base.mutated <- "-"
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets[-index_aux_3,]
nanopolish_mismatches_subsets <- nanopolish_mismatches_subsets %>% rbind(aux_df_1) %>% rbind(aux_df_2)%>% rbind(aux_df_3) %>% arrange(Barcode,Nreads,repetition,Position) %>% mutate(Method="Nanopolish",homopolymers=NA, mutation=paste0(base.original,Position,base.mutated))
single_mismatches <- read.table(paste0(path_analysis_lib,"ConsensusSINGLE_toptenBC.txt"), header=F,row.names=NULL, skip=1)
colnames(single_mismatches)<-c("Barcode","Nreads","repetition","mutation","Method")
single_mismatches <- single_mismatches %>%
mutate(base.original = str_sub(mutation,1,1),
base.mutated=str_sub(mutation,-1,-1),
Position=str_sub(mutation,start=2,end=-2)) %>%
mutate(Position = as.numeric(Position), homopolymers="original") #%>% select(-mutation)
write.table(rbind(nanopolish_mismatches_subsets %>% mutate(Method="Nanopolish"),
medaka_mismatches_subsets %>% mutate(Method="Medaka"),
single_mismatches, file=file_consensus_top10_inLIB))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Compute true mutants from all reads according to each method
tic <- proc.time()
KTLib_topBC_true_mutations <- read.table("3_PreprocessedData/Fig3_LargeSet_Consensus.txt", header=T) %>%
dplyr::rename(Method=method) %>%
mutate(Method = recode(Method,!!!methods_capital))%>%
filter(Barcode %in% KTLib_topBC$BCsequence) %>%
filter(Method!="Racon") %>%
filter((Method=="SINGLe" & homopolymers=="sorted_in_VCS") |
is.na(homopolymers) |
(Method!="SINGLe" & homopolymers=="original") )%>%
mutate(mutation=paste0(base.original,Position,base.mutated)) %>%
select(Barcode,Method,mutation)
rownames(KTLib_topBC_true_mutations) <- NULL
#manually replace equivalent mutations
KTLib_topBC_true_mutations <- KTLib_topBC_true_mutations %>%
mutate(mutation=ifelse(mutation=="G677-","G676-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G419-","G416-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G63-","G62-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G429-","G428-", mutation)) %>%
mutate(mutation=ifelse(mutation=="A1545-","A1543-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G244-","G243-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G263-","G261-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G677-","G676-", mutation))
aux_nanopolish <- data.frame(Barcode= KTLib_topBC_true_mutations$Barcode[KTLib_topBC_true_mutations$mutation=="GGAC1306G"],
Method="Nanopolish",
mutation=c("G1307-","A1308-", "C1309-"))
KTLib_topBC_true_mutations <- KTLib_topBC_true_mutations %>%
filter(mutation!="GGAC1306G") %>%
rbind(aux_nanopolish)
compare_mutations <- reshape2::dcast(KTLib_topBC_true_mutations %>%
select(Barcode, Method, mutation),
Barcode+mutation~Method)
KTLib_topBC_true_mutants <- KTLib_topBC_true_mutations %>%
group_by(Barcode,Method) %>%
summarise(n_mutations=n(),mutant = paste0(mutation, collapse = " ")) %>%
ungroup()
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
Compute success rate for consensus calling by method, as a function of reads used
tic <- proc.time()
# consensus_by_method
KTLib_topBC_subsets_mutations <- read.table("3_PreprocessedData/Fig3_ConvergenceConsensus_top10BC.txt", header=T) %>%
filter(Method!="racon") %>%
filter((Method=="single" & homopolymers=="sorted_for_VCS") |
is.na(homopolymers) |
(Method!="single" & homopolymers=="original") ) %>%
select(-homopolymers,-variant) %>%
select(-base.original,-Position,-base.mutated)%>%
mutate(Method = recode(Method,!!!methods_capital))
KTLib_topBC_subsets_mutations <- KTLib_topBC_subsets_mutations %>%
mutate(mutation=ifelse(mutation=="G677-","G676-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G419-","G416-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G63-","G62-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G429-","G428-", mutation)) %>%
mutate(mutation=ifelse(mutation=="A1545-","A1543-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G244-","G243-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G263-","G261-", mutation)) %>%
mutate(mutation=ifelse(mutation=="G677-","G676-", mutation))
indx <- which(KTLib_topBC_subsets_mutations$mutation=="GGAC1306G")
aux_nanopolish <- data.frame(Barcode= rep(KTLib_topBC_subsets_mutations$Barcode[indx],3),
Nreads = rep(KTLib_topBC_subsets_mutations$Nreads[indx], each=3),
repetition = rep(KTLib_topBC_subsets_mutations$repetition[indx], each=3),
Method="Nanopolish",
mutation=rep(c("G1307-","A1308-", "C1309-"), lenght.out=length(indx)))
KTLib_topBC_subsets_mutations <- KTLib_topBC_subsets_mutations %>%
filter(mutation!="GGAC1306G") %>%
rbind(aux_nanopolish)
KTLib_topBC_subsets_mutant <- KTLib_topBC_subsets_mutations %>%
select(Barcode,Nreads,repetition,Method,mutation) %>%
group_by(Barcode,Nreads,repetition,Method) %>%
summarise(n_mutations=n(),mutant=paste(mutation, collapse = " "))
KTLib_topBC_subsets_success_rate <- left_join(KTLib_topBC_subsets_mutant,
KTLib_topBC_true_mutants,
by=c("Barcode","Method"),
suffix=c(".subset",".true")) %>%
mutate(equal = mutant.subset==mutant.true ) %>%
filter(!is.na(n_mutations.true)) %>%
group_by(Barcode,Nreads,Method) %>%
dplyr::summarise(success_rate= sum(equal)/n())
ggplot(KTLib_topBC_subsets_success_rate,aes(x=Nreads,y=success_rate,col=Method) )+geom_point()+geom_line()+
facet_wrap(~Barcode,ncol=2)+scale_color_manual(values=colors_vector_capital[-6])+
ylab("Success rate")
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
fig_3a <- ggplot(KTLib_topBC_subsets_success_rate %>%
filter((Method =="SINGLe"|Method=="Medaka") & Barcode=="TCAGTTATTACGTCCTGCGGACAGTAAGGTCCGAG"),
aes(x=Nreads,y=success_rate,col=Method) )+
geom_point()+
geom_line()+
scale_color_manual(values=colors_vector_capital[1:2])+
scale_y_continuous(limits=c(0,1),breaks = c(0,.25,.5,.75,1))+
geom_hline(aes(yintercept=0.9), col=grey(0.5),lty="dashed")+
theme_plot+
theme(legend.position = c(.9,0.4),
legend.justification = c("right","top"),
legend.title = element_blank(),
legend.spacing.y = unit(0,"line"))+
labs(tag = "A",x="Reads", y="Success rate")
fig_3a
Fig_Sup7<- ggplot(KTLib_topBC_subsets_success_rate %>% filter(Method!= "Racon"),
aes(x=Nreads,y=success_rate,col=Method) )+
geom_point()+geom_line()+
facet_wrap(~Barcode,ncol=2)+scale_color_manual(values=colors_vector_capital[-6])+
scale_y_continuous(breaks=c(0,.5,1))+
geom_hline(aes(yintercept=0.9), col=grey(.5),linetype="dashed")+
ylab("Success rate") +theme_plot+
theme(plot.margin = unit(c(0,5.5,5.5,5.5),"points"),
strip.background = element_rect(fill= "white"),
strip.text = element_text(size=8),
legend.position = "bottom")
Fig_Sup7
ggsave(Fig_Sup7, file=paste0(path_save_figures,"FigureS7.png"),
dpi = 'print', width = 17, height=18,units="cm")
tic <- proc.time()
KTLib_mutations <- read.table("3_PreprocessedData/Fig3_LargeSet_Consensus.txt") %>%
filter(method!="Racon") %>%
filter((method=="single" & homopolymers=="sorted_in_VCS") |
is.na(homopolymers) |
(method!="single" & homopolymers=="original") )
aux_nanopolish <- KTLib_mutations %>%
filter(nchar(base.original)>1) %>%
mutate(base.mutated="-")
aux_nanopolish <- (aux_nanopolish %>% mutate(base.original=str_sub(base.original, 2,2)) ) %>%
rbind(aux_nanopolish%>% mutate(base.original=str_sub(base.original, 3,3))) %>%
rbind(aux_nanopolish%>% mutate(base.original=str_sub(base.original, 4,4)))
KTLib_mutations <- KTLib_mutations %>%
filter(!nchar(base.original)>1) %>%
rbind(aux_nanopolish)
KTLib_nreads_per_barcode <- barcodes_4reads %>%
group_by(BCsequence) %>% dplyr::summarise(Nreads=n()) %>%
mutate(Barcode = BCsequence) %>% select(-BCsequence) %>% select(Barcode, Nreads) %>%
arrange(Barcode)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
tic <- proc.time()
n_mutations <- KTLib_mutations %>%
filter(method=="single") %>%
group_by(Barcode) %>%
dplyr::summarise(n_mutations = n()) %>%
left_join(KTLib_nreads_per_barcode) %>% filter(Nreads>5)
plot_nmutations <- ggplot(n_mutations, aes(x=n_mutations))+
geom_histogram()+
theme_plot+
labs(x="Number of mutations", y="Counts")
mean_n_mutations <- mean(n_mutations$n_mutations)
median_n_mutations <- median(n_mutations$n_mutations)
sd_n_mutations <- sd(n_mutations$n_mutations)
mutation_rate_measured <- mean_n_mutations/1662*1000
sd_n_mutations/1662*1000
plot_nmutations
cat("Mean N mutations: ", round(mean_n_mutations,2),
"\nMedian N mutations: ", round(median_n_mutations,2),
"\nSD N mutations: ", round(sd_n_mutations,2),
"\nMutation rate measured: ", round(mutation_rate_measured,2), "muts/kb")
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
tic <- proc.time()
consensus_mismatches_ms <- KTLib_mutations %>%
filter((method=="Medaka" )| (method=="single"))
nmismatches_ms <- consensus_mismatches_ms %>%
group_by(Barcode, method) %>%
summarise(n_mismatches=n())
nmismatches_ms_difference <- left_join(nmismatches_ms %>% filter(method=="Medaka" ),
nmismatches_ms %>% filter(method=="single" ),
by="Barcode", suffix=c("",".single")) %>%
mutate(difference = n_mismatches.single-n_mismatches) %>%
left_join(KTLib_nreads_per_barcode, by = "Barcode") %>%
ungroup() %>%
mutate(Reads = ifelse(Nreads<=5,"5 or less reads", ifelse(Nreads<=10,"6 to 10 reads","10 or more reads"))) %>%
mutate(Reads=factor(Reads, levels=c("5 or less reads","6 to 10 reads","10 or more reads")))%>%
select(-Nreads) %>% ungroup() %>% filter(!is.na(n_mismatches.single))
nmismatches_ms_difference_hist <- nmismatches_ms_difference %>%
group_by(method,difference,Reads) %>%
summarise(counts=n())
nmismatches_ms_difference_hist <- nmismatches_ms_difference %>%
ungroup()%>%
mutate(difference = ifelse(difference< -6, -6, difference))%>%
group_by(method,difference,Reads) %>%
summarise(counts=n()) %>%
group_by(Reads) %>% mutate(perc = counts/sum(counts)*100)
table_mutations_compare <- nmismatches_ms_difference_hist %>%
mutate(diff = ifelse(difference<0,-1,ifelse(difference==0,0,1)))%>%
group_by(Reads, diff) %>% summarise(n=sum(counts)) %>%
group_by(Reads) %>%
mutate(perc=n/sum(n)*100)
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
fig_3b <- ggplot(table_mutations_compare , aes(x=diff,y=perc))+
geom_bar(position = position_dodge2(preserve="single"),stat="identity")+
xlab("Method that reports more mutations")+ylab("Percentage")+
facet_wrap(~Reads, ncol=1, strip.position = "right", labeller = label_wrap_gen(width=12))+
scale_y_continuous(limits = c(0,110), breaks=c(50,100))+
scale_x_continuous(breaks=seq(-1,1,by=1), labels=c("Medaka", "Both match", "SINGLe"))+
theme(legend.position = c(0,1),
legend.justification = c("left","top"),
legend.title = element_text(size = rel(1)),
legend.text = element_text(size=rel(1)),
legend.spacing.y = unit(0.5,"line"),
legend.key.size = unit(2,"pt"),
plot.margin = unit(c(0,5.5,5.5,5.5),"points"),
strip.background = element_rect(fill=grey(.95)))+
labs(tag = "B")+
geom_text(aes(y=perc,label=paste0(round(perc),"%"),vjust=-.2))
fig_3b
tic <- proc.time()
results_for_entropy<- KTLib_mutations %>%
filter(base.original!="wildtype" ) %>%
filter(Position >0 & Position <=1662) %>%
left_join(KTLib_nreads_per_barcode, by="Barcode") %>%
mutate(group_Nreads_barcode = ifelse(Nreads>10, "> 10 reads",
ifelse(Nreads>5,"6 to 10 reads","5 or less reads"))) %>%
mutate(group_Nreads_barcode=factor(group_Nreads_barcode, levels=c("5 or less reads","6 to 10 reads", "> 10 reads")))%>%
group_by(method,Position,group_Nreads_barcode, homopolymers) %>%
dplyr::summarise(n=n())%>% ungroup() %>%
mutate(Method = recode(method,!!!setNames(c("Nanopolish","Medaka","SINGLe","Guppy","No weights"),c("nanopolish","medaka","single","nanopore","no_weights")))) %>%
select(-method)
skewness <- results_for_entropy%>%
group_by(Method, group_Nreads_barcode) %>%
dplyr::summarise(skewness=skewness(n))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
fig_3c <- ggplot(results_for_entropy %>% filter(Method=="Medaka" | Method == "SINGLe"),
aes(x=Position, y=n, col=Method))+
geom_point(size=0.5)+
scale_color_manual(values=colors_vector_capital)+
ylab("Counts")+
facet_grid(Method~group_Nreads_barcode)+
geom_text(
data = skewness %>% filter(Method=="Medaka" | Method == "SINGLe"),
mapping = aes(x = -Inf, y = -Inf, label = paste0("sk=", round(skewness,2))),
hjust = 0,
vjust = 1,
y=rep(140,6), x=rep(80,6),
col="black"
)+
theme_plot+
theme(legend.position = "none",plot.margin = unit(c(1,5.5,5.5,5.5),"points"),)+
labs(tag = "C")
fig_3c
Figure_S8 <- ggplot(results_for_entropy ,
aes(x=Position, y=n, col=Method))+
geom_point(size=0.5)+scale_color_manual(values=colors_vector_capital[-6])+geom_line()+
ylab("Counts")+
facet_grid(Method~group_Nreads_barcode)+
geom_text(
data = skewness,
mapping = aes(x = -Inf, y = -Inf, label = paste0("sk=", round(skewness,2))),
hjust = 0,
vjust = 1,
y=rep(200,nrow(skewness)), x=rep(80,nrow(skewness)),
col="black"
)+theme_plot+ theme(legend.position = "none",plot.margin = unit(c(0,5.5,5.5,5.5),"points"),strip.background = element_rect(fill="white"),strip.text = element_text(size=8))
Figure_S8
ggsave(Figure_S8, filename="6_Figures/FigureS8.png",dpi = 'print', width = 17, units = "cm", height=20)
Figure3 <- ( fig_3a | fig_3b) / fig_3c
Figure3
ggsave(Figure3, file=paste0(path_save_figures,"Figure3.png"),
dpi = 'print', width = 16, height=16,units="cm")
kit_rates <- data.frame(
Mutation=c("Deletion", "Ts AG TC" ,"Ts GA CT" ,"Tv AT TA","Tv AC TG" ,"Tv GC CG", "Tv GT CA"),
true_perc=c(4.8,17.5,25.5,28.5,4.7,4.1,14.1))
aux <- KTLib_mutations %>%
left_join(KTLib_nreads_per_barcode,by="Barcode") %>%
mutate(reads_number = case_when(
Nreads<6 ~ "4 or 5 reads",
Nreads <11 ~ "6 to 10 reads",
Nreads>10 ~ "more than 10 reads") )%>%
mutate(reads_number= factor(reads_number, levels=c("4 or 5 reads", "6 to 10 reads", "more than 10 reads")))%>%
mutate(Mutation=case_when(
base.mutated=="-"~ "Deletion",
base.original=="A" & base.mutated=="G" ~ "Ts AG TC",
base.original=="T" & base.mutated=="C" ~ "Ts AG TC",
base.original=="G" & base.mutated=="A" ~ "Ts GA CT",
base.original=="C" & base.mutated=="T" ~ "Ts GA CT",
base.original=="A" & base.mutated=="T" ~ "Tv AT TA",
base.original=="T" & base.mutated=="A" ~ "Tv AT TA",
base.original=="A" & base.mutated=="C" ~ "Tv AC TG",
base.original=="T" & base.mutated=="G" ~ "Tv AC TG",
base.original=="G" & base.mutated=="C" ~ "Tv GC CG",
base.original=="C" & base.mutated=="G" ~ "Tv GC CG",
base.original=="G" & base.mutated=="T" ~ "Tv GT CA",
base.original=="C" & base.mutated=="A" ~ "Tv GT CA"
)) %>%
group_by(method,reads_number,Mutation)%>%
summarise(counts = n()) %>%
group_by(method,reads_number) %>%
mutate(counts=counts/sum(counts)*100) %>%
left_join(kit_rates, by="Mutation")%>%
ungroup()%>%
mutate(Method = recode(method,!!!setNames(c("Nanopolish","Medaka","SINGLe","Guppy","No weights"),c("nanopolish","medaka","single","nanopore","no_weights")))) %>%
select(-method)
plot_mr_xy <- ggplot(aux, aes(col=Method, x=true_perc, y=counts, shape=Mutation,label=Mutation))+
geom_point()+
geom_abline(aes(slope=1,intercept=0), lty="dashed", col=grey(.5))+
facet_wrap(~reads_number,ncol=1)+xlim(c(0,30))+scale_shape_manual(values=1:7)+theme_plot+
xlab("By manufacturer (%)")+
ylab("Observed (%)")+theme_plot+ggtitle("Mutation rate")
plot_mr_cor <- ggplot(aux %>% group_by(Method, reads_number) %>%
summarise(cor=cor(counts, true_perc)) %>%
ungroup(), aes(x=reads_number, y=cor, col=Method))+
geom_point()+
geom_line(aes(x=as.numeric(reads_number)))+
ylab("Correlation")+xlab("")+
# ggtitle("Compare observed mutation rate vs. manufacturer's report")+
scale_y_continuous(breaks=c(0.8,0.9,1), limits = c(0.8,1))+theme_plot+
theme(axis.text.x = element_text(angle=90,hjust = 1,vjust=0.5))
FigureS9 <- grid.arrange(plot_mr_xy , plot_mr_cor+theme(legend.position = "none"), ncol=2)
ggsave(FigureS9, file=paste0(path_save_figures,"FigureSMutRate.png"),
dpi = 'print', width = 16, height=12,units="cm")
tic <- proc.time()
a <- load("/media/rocio/Data10TB/Work/Bibliography/RondelezLab/2022_SINGLe/GigaScience_submission/Espada_et_al/5_Revision/01_Subsets/01_KlenTaq/barcode01_allseqs_countspnq.Rdata")
full_fits <- read.table("/media/rocio/Data10TB/Work/Bibliography/RondelezLab/2022_SINGLe/GigaScience_submission/Espada_et_al/5_Revision/01_Subsets/01_KlenTaq/barcode01_allreads_fits.txt", header=T)
full_fits <- full_fits %>%
mutate(rqs = 1-deviance/null_deviance)
error_rate <- data_mut %>%
group_by(pos,strand,nucleotide) %>%
dplyr::summarise(error_rate = sum(count)/(sum(count)+sum(counts.wt)),
counts_mut = sum(count))
full_fits<- full_fits %>%
left_join(error_rate, by=c("strand","pos","nucleotide"))
## Full fits mutated positions
mutationstested <- kt_true_mutations %>%
select(position, nucleotide) %>%
dplyr::rename(pos=position)
full_fits_mut <- left_join(mutationstested, full_fits)
hex <- scales::hue_pal()(4)
names(hex)<- c(303,331,879,1316)
full_fits_mut_examples <- full_fits_mut %>%
filter( (pos==1316 & nucleotide=="A" & strand=="+")|
(pos==879 & nucleotide=="T" & strand=="-")|
(pos==303 & nucleotide=="A" & strand=="-")|
(pos==331 & nucleotide=="T" & strand=="+"))
plot_counts_RSQ <- ggplot(full_fits,aes(x=rqs,y=counts_mut))+
geom_point(col=rgb(0,0,0,0.1), stroke=0, size=2)+
scale_y_log10()+
theme_plot+
geom_point(data=full_fits_mut_examples,
aes(x=rqs,y=counts_mut, col=as.factor(pos)),
size=2,shape="square")+
scale_color_manual(values=hex)+
labs(x="Quality (Q)", y="Counts(C)")+theme(legend.position = "none")
xlim <- 1:50
plots <- list()
for(i in 1:nrow(full_fits_mut_examples)){
nuc <- full_fits_mut_examples$nuc[i]
pot <- full_fits_mut_examples$pos[i]
str <- full_fits_mut_examples$strand[i]
aux_df <- data_mut %>%
dplyr::filter(pos==pot & nucleotide ==nuc & strand==str)%>%
dplyr::select(strand,QUAL,count, counts.wt,counts.scaled,counts.wt.scaled) %>%
dplyr::mutate(tot.scaled=counts.scaled+counts.wt.scaled) %>%
dplyr::mutate(proportion.wt.scaled=counts.wt.scaled/tot.scaled) %>%
arrange(QUAL)
qval <- aux_df$QUAL
yvals <- aux_df$proportion.wt.scaled
no <- !is.na(yvals)
yvals <-yvals[no]
qval <- qval[no]
fitt <- stats::glm(yvals~ qval,family = "binomial")
prediction <- data.frame(QUAL=xlim, fitted = predict(fitt,list(qval=xlim), type = "response"))
# lines(qval,predict(fitt,list(qval=qval),type="response"), col=2, lwd=2)
qq<- format((full_fits_mut_examples %>%
filter(pos==pot & nucleotide ==nuc & strand==str))$rqs,scientific=F, digits=2)
cc<- (full_fits_mut_examples %>%
filter(pos==pot & nucleotide ==nuc & strand==str))$counts
plots[[i]] <- ggplot(aux_df, aes(x=QUAL,y=proportion.wt.scaled))+
geom_point()+
geom_line(data=prediction, aes(x=QUAL, y=fitted),
col=hex[as.character(pot)])+
ggtitle(paste(pot,nuc,str),
subtitle=paste("C:", cc, " QF:",qq))+
labs(x="Qscore", y="ncorrect")+
xlim(range(xlim))+
scale_y_continuous(breaks=c(0,1), limits = c(0,1))+
theme(plot.title = element_text(margin = margin(0,0,0,0)),
plot.subtitle = element_text(margin = margin(0,0,0,0)))
}
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
FigureS10 <- grid.arrange(plot_counts_RSQ , plots[[1]], plots[[4]], plots[[3]], plots[[2]] ,
layout_matrix=matrix(c(1,1,2,3,4,5), byrow = F,nrow=2),
widths=c(2.5,1,1))
ggsave(FigureS10, file=paste0(path_save_figures,"FigureExtra.png"), dpi = 'print',
height = 8.7, width=20,units="cm")
FigureS10
tic <- proc.time()
path_11 <- "/media/rocio/Data10TB/minIONreads/KlenTaq/2019_11_KT/KT7_fullpipeline/KT7/BasecallSuperior/pass/out/Demultiplex/minimap2/SINGLE/"
data_11 <- read.table(paste0(path_11, "barcode05.for_corrected.txt"), header=T)
data_11 <- data_11
error_lines <- which(!data_11$isWT&
!(data_11$position==867 & data_11$nucleotide=="G")&
!(data_11$position==1120 & data_11$nucleotide=="A") &
!(data_11$position==1214 & data_11$nucleotide=="T") &
!(data_11$position==1316 & data_11$nucleotide=="A") &
!(data_11$nucleotide=="-"))
correct_lines <- which(data_11$isWT &
!(data_11$position==867 & data_11$nucleotide=="G")&
!(data_11$position==1120 & data_11$nucleotide=="A") &
!(data_11$position==1214 & data_11$nucleotide=="T") &
!(data_11$position==1316 & data_11$nucleotide=="A") &
!(data_11$nucleotide=="-"))
rand_pos <- sample(1:nrow(data_11), 200)
df11 <- rbind(
data.frame(type="Errors",
difference = abs(data_11$original_quality[error_lines]-data_11$original_quality[error_lines+1]), wt = data_11$isWT[error_lines+1]),
data.frame(type="Correct",
difference = abs(data_11$original_quality[correct_lines]-data_11$original_quality[correct_lines+1]), wt = data_11$isWT[correct_lines+1])) %>%
rbind( data.frame(type="Random",
difference = abs(data_11$original_quality[rand_pos]-data_11$original_quality[sample(1:nrow(data_11), 200)]), wt = data_11$isWT[rand_pos])
) %>%
mutate(difference = ifelse(difference>20, 20, difference))
toc <- proc.time()
cat("it took" , (toc-tic)[3], "s to run this chunk\n"); rm(tic, toc)
FigureS3 <- ggplot(df11, aes(x=difference))+
geom_histogram(binwidth = 1,aes(y=..density.., text=..density..))+
xlab("Qscore difference with next nucleotide")+
ylab("Counts")+
labs(tag="C")+
facet_wrap(~type, scales = "free")
ggsave(FigureS3, file=paste0(path_save_figures,"FigureS3.png"), dpi = 'print',
height = 5, width=15,units="cm")
FigureS3
Depricated nextpolish:
# create_run_cfg <- function(run_cfg_file, ref_fasta, workdir){
# cat("[General]\n", file=run_cfg_file, append=FALSE)
# cat('job_type = local\n', file=run_cfg_file, append=TRUE)
# cat('job_prefix = nextPolish\n', file=run_cfg_file, append=TRUE)
# cat('task = best\n', file=run_cfg_file, append=TRUE)
# cat('rewrite = yes\n', file=run_cfg_file, append=TRUE)
# cat('rerun = 3\n', file=run_cfg_file, append=TRUE)
# cat('parallel_jobs = 2\n', file=run_cfg_file, append=TRUE)
# cat('multithread_jobs = 4\n', file=run_cfg_file, append=TRUE)
# cat('genome =', ref_fasta,' #genome file\n', file=run_cfg_file, append=TRUE)
# cat('genome_size = auto\n', file=run_cfg_file, append=TRUE)
# cat('workdir =',workdir, "\n",file=run_cfg_file, append=TRUE, sep="")
# cat(' polish_options = -p {multithread_jobs}\n', file=run_cfg_file, append=TRUE)
#
# cat('[lgs_option]\n', file=run_cfg_file, append=TRUE)
# cat('lgs_fofn =lgs.fofn\n', file=run_cfg_file, append=TRUE)
# cat('lgs_options = -min_read_len 1k -max_depth 100\n', file=run_cfg_file, append=TRUE)
# cat('lgs_minimap2_options = -x map-ont\n', file=run_cfg_file, append=TRUE)
# return(0)
# }
#
#
# path0 <- "/home/respada/22nextpolish/bc06/"
# setwd(path0)
# run_folder <- "RunFolder/"
# save_folder <- ""
# data_folder <- ""
#
# for(bc in kt7_barcodes){
#
# #Load fastq file with all reads
# all_sequences <- readLines(paste0(path_analysis_lib7, bc,".fastq"))
# name_lines <- grep(pattern="^@",x=all_sequences)
# save_file <- paste0(save_folder,bc, "_ConsensusByNextPolish.fasta")
#
# for(ns in subsets){
# cat("\t",ns,"\n")
# if(ns> length(name_lines)){next()}
# for(r in 1:n_repetitions){
# cat("----------------",ns, r, "\n")
#
# #Create file with subset of sequences
# subset_name = paste0(bc,"_s",ns, "_r",r)
# filename_subset <- paste0(run_folder,subset_name,".fastq")
# choose_lines <- sample(name_lines, size= ns)
# choose_lines_all <- sort(c(choose_lines,choose_lines+1,choose_lines+2,choose_lines+3))
# writeLines(text = all_sequences[choose_lines_all], con = file(filename_subset))
#
# ## Run nextpolish
# system2(paste0("echo '", subset_name,".fastq' >", run_folder, "lgs.fofn"))
# create_run_cfg(run_cfg_file=paste0(run_folder, "run.cfg"),
# ref_fasta=reference_file,
# workdir=paste0(subset_name,"/"))
# system2(paste0("cp ", reference_file, " ",run_folder, reference_file ))
# system2(paste0("cd ",run_folder,"; nextPolish run.cfg"))
# system2(paste0("sed -i '1s/.*/>",subset_name,"'/ ", run_folder, subset_name, "/genome.nextpolish.fasta "))
# system2(paste0("cat ",run_folder, subset_name, "/genome.nextpolish.fasta >>", save_file))
# system2(paste0("rm -r ", run_folder, subset_name))
# system2(paste0("cd ",run_folder,";rm lgs.fofn run.cfg"))
#
# }
# }
# }
Alternative to fill deletions faster:
### OPTION
t0 <- proc.time()
a <- str_locate_all(reads_aligned, "-+")
for(i in seq(a)){
if(nrow(a[[i]])==0){next()}
pos_to_replace <- IRanges(a[[i]][,"start"], a[[i]][,"end"])
pos_to_average_start <- IRanges(a[[i]][,"start"]-1,a[[i]][,"start"]-1)
pos_to_average_end <- IRanges(a[[i]][,"end"]+1,a[[i]][,"end"]+1)
if(start(pos_to_average_start)[1]==0){
start(pos_to_average_start)[1] <- end(pos_to_average_start)[1] <- start(pos_to_average_end)[1]
}
n <- length(pos_to_average_start)
pos_max <- width(reads_aligned)[i]
if(start(pos_to_average_end)[n]>pos_max){
start(pos_to_average_end)[n] <- start(pos_to_average_start)[n]
end(pos_to_average_end)[n] <- start(pos_to_average_start)[n]
}
before <- extractAt(scores_aligned[[i]], at = pos_to_average_start)
before <- unlist(as(PhredQuality(before),"NumericList"))
after <- extractAt(scores_aligned[[i]], at = pos_to_average_end)
after <- unlist(as(PhredQuality(after),"NumericList"))
both <- cbind(before,after)
if(gaps_weights=="mean"){
replace_qscore <- apply(both,1, mean)
}else if(gaps_weights=="minimum"){
replace_qscore <- apply(both,1, min)
}
replace_qscore <- lapply(replace_qscore, function(x){as(x,"PhredQuality")})
replace_qscore_v <- vapply(seq(replace_qscore), function(x){paste0(rep(replace_qscore[[x]], times=width(pos_to_replace)[x]), collapse="")})
# print(scores_aligned[[i]])
scores_aligned[i] <- replaceAt(scores_aligned[i],at = pos_to_replace, value = replace_qscore_v)
# print(scores_aligned[[i]])
}
t1 <- proc.time()
t1-t0
### ORIGINAL
if(verbose){message("Assign values to deletions\n")}
if(verbose){pb=utils::txtProgressBar(min =0,max=length(reads_aligned),style = 3)}
t2 <- proc.time()
for(i in seq(length(reads_aligned))){
# if(verbose){utils::setTxtProgressBar(pb,i)}
## REPLACE DELETION SCORES
del_positions <- BiocGenerics::start(Biostrings::vmatchPattern("-",reads_aligned[i])[[1]])
ndel <- length(del_positions)
if(ndel==0){next()}
nr <- width(reads_aligned)[i]
#If they are at the beginning, I replace by the first Qscore
if(del_positions[1]==1){
n_del_start=1
if(length(del_positions)==1){
n_del_start <- 1
}else{
condition = del_positions[n_del_start+1]==del_positions[n_del_start]+1
while(condition){
n_del_start <- n_del_start+1
condition1 = is.na(del_positions[n_del_start+1])
if(condition1){
condition=FALSE
}else{
condition = del_positions[n_del_start+1]==del_positions[n_del_start]+1
}
}
}
scores_aligned[i] <- Biostrings::replaceAt(scores_aligned[i],at=IRanges(1,n_del_start),as.character(subseq(scores_aligned[i], 1, n_del_start)))
}else{n_del_start=NA}
#If they are at the end, I replace by the last Qscore
if(del_positions[ndel]==nr){
#detect which are the values on the end of the string
Breaks <- c(0, which(diff(del_positions) != 1), length(del_positions))
Breaks_ini <- Breaks[length(Breaks)-1]+1
Breaks_end <- Breaks[length(Breaks)]
del_pos_ini <- del_positions[Breaks_ini]
del_pos_end <- del_positions[Breaks_end]
replacement <- paste0(rep(as.character(subseq(scores_aligned[i], del_pos_ini-1, del_pos_ini-1)), Breaks_end-Breaks_ini+1),collapse="")
scores_aligned[i] <-Biostrings::replaceAt(scores_aligned[i],
at=IRanges(del_pos_ini,del_pos_end),
replacement)
n_del_end <- length(Breaks_ini:Breaks_end)
}else{n_del_end=NA}
if(!is.na(n_del_end)){del_positions <- del_positions[-seq(ndel-n_del_end+1,ndel)]}
if(!is.na(n_del_start)){del_positions <- del_positions[-seq(n_del_start)]}
if(length(del_positions)==0){next()}
ndel <- length(del_positions)
for(j in seq_along(del_positions)){
jmin <- j
while(j < ndel && del_positions[j+1]==del_positions[j]+1){ j <- j+1 }
jmax<-j
q_upstr <- subseq(scores_aligned[i], del_positions[jmin]-1, del_positions[jmin]-1)
q_dwstr <- subseq(scores_aligned[i], del_positions[jmax]+1, del_positions[jmax]+1)
if(gaps_weights=="mean"){
prob_upstr <- as(q_upstr,"NumericList")[[1]]
prob_dwstr <- as(q_dwstr,"NumericList")[[1]]
prob <- mean(prob_dwstr,prob_upstr)
}else if(gaps_weights=="minimum"){
prob_upstr <- as(q_upstr,"NumericList")[[1]]
prob_dwstr <- as(q_dwstr,"NumericList")[[1]]
prob <- min(prob_dwstr,prob_upstr)
}
qscore_new <- as(prob,"PhredQuality")
qscore_new_vec <- paste0(rep(as.character(qscore_new), jmax-jmin+1), collapse="")
scores_aligned[i] <- Biostrings::replaceAt(scores_aligned[i],at=IRanges(del_positions[jmin],del_positions[jmax]),qscore_new_vec)
}
}
t3 <- proc.time()
t3-t2