#' ### 3. REMOVE Chimeras and ASSIGN Taxonomy
sink(file = "./console_output/06_console_output.txt") ## open sink connection to write console output to file
#+ include=FALSE
# some setup options for outputting markdown files; feel free to ignore these
# NOTE: taxonomy is assigned below via dada2::assignTaxonomy(); qiime2 classify-sklearn
# can be run externally instead, as it produced far fewer NAs than dada2::assignTaxonomy()
knitr::opts_chunk$set(eval = FALSE,
                      include = TRUE,
                      warning = FALSE,
                      message = FALSE,
                      collapse = TRUE,
                      dpi = 300,
                      fig.dim = c(9, 9),
                      out.width = '98%',
                      out.height = '98%')
#'
#+ include=FALSE
# this is to load in the previous R environment and necessary packages
# if you are running the pipeline in pieces with slurm

# Load DADA2 and required packages
library(dada2); packageVersion("dada2") # the dada2 pipeline
library(ShortRead); packageVersion("ShortRead") # dada2 depends on this
library(dplyr); packageVersion("dplyr") # for manipulating data
library(tidyr); packageVersion("tidyr") # for creating the final graph at the end of the pipeline
library(Hmisc); packageVersion("Hmisc") # for creating the final graph at the end of the pipeline
library(ggplot2); packageVersion("ggplot2") # for creating the final graph at the end of the pipeline
library(plotly); packageVersion("plotly") # enables creation of interactive graphs, especially helpful for quality plots

load(file = "dada2_ernakovich_Renv.RData")
#' Although dada2 has searched for indel errors and substitutions, there may still be chimeric
#' sequences in our dataset (sequences that are derived from forward and reverse sequences from
#' two different organisms becoming fused together during PCR and/or sequencing). To identify
#' chimeras, we will search for rare sequence variants that can be reconstructed by combining
#' left-hand and right-hand segments from two more abundant "parent" sequences. After removing
#' chimeras, we will use a taxonomy database to train a classifier algorithm to assign names
#' to our sequence variants.
#'
#' For the tutorial 16S data, we will assign taxonomy with the SILVA db v138, but you might want to use other databases for your data. Below are paths to some of the databases we use often. (If you are on your own computer, you can download the database you need from this link [https://benjjneb.github.io/dada2/training.html](https://benjjneb.github.io/dada2/training.html)):
#'
#' - 16S bacteria and archaea (SILVA db): ```/mnt/home/ernakovich/shared/db_files/dada2/silva_nr99_v138.1_train_set.fa ```
#'
#' - ITS fungi (UNITE db): ```/mnt/home/ernakovich/shared/db_files/dada2/UNITE_sh_general_release_10.05.2021/sh_general_release_dynamic_10.05.2021.fasta```
#'
#' - 18S protists (PR2 db): ```/mnt/home/ernakovich/shared/db_files/dada2/pr2_version_4.14.0_SSU_dada2.fasta```
#'
# Read in RDS
st.all <- readRDS(paste0(table.fp, "/seqtab.rds"))

# Remove chimeras
seqtab.nochim <- removeBimeraDenovo(st.all, method = "consensus", multithread = TRUE)

# Print the percentage of our sequences that were not chimeric.
100 * sum(seqtab.nochim) / sum(st.all)
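# (optional) a quick look at how many ASVs (as opposed to reads) were flagged as chimeric;
# chimeras are often numerous as distinct variants but account for a small fraction of total reads
ncol(st.all) - ncol(seqtab.nochim)             # number of chimeric ASVs removed
100 * (1 - ncol(seqtab.nochim) / ncol(st.all)) # percentage of ASVs that were chimeric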
# PEDRO's alteration: save and export the ASV table and representative sequences before trying to assign taxonomies
# Write results to disk
saveRDS(seqtab.nochim, paste0(table.fp, "/seqtab_final.rds"))

# load data after chimera removal
seqtab.nochim <- readRDS(paste0(table.fp, "/seqtab_final.rds"))
# PEDRO'S ALTERATION: check the number of columns (ASVs) per rows (samples) of the original
# object, and after reducing the number of ASVs by minimum abundance
# number of samples x ASVs in the full dataset, and in the same dataset keeping only ASVs
# with more than 0, 1, 2, or 3 occurrences
dim(seqtab.nochim)
dim(seqtab.nochim[, colSums(seqtab.nochim) > 0])
dim(seqtab.nochim[, colSums(seqtab.nochim) > 1])
dim(seqtab.nochim[, colSums(seqtab.nochim) > 2])
dim(seqtab.nochim[, colSums(seqtab.nochim) > 3])
# PEDRO'S ALTERATION: overwrite the ASV table, removing ASVs that do not appear at least
# 3 times. This is necessary because the original table with 708k ASVs is too large to
# handle with 1000 GB of memory in R
seqtab.nochim <- seqtab.nochim[, colSums(seqtab.nochim) > 2]
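# (optional, alternative sketch) instead of a total-count cutoff, ASVs could be filtered by
# prevalence, e.g. keeping only ASVs observed in at least 2 samples; adjust to your design
# seqtab.nochim <- seqtab.nochim[, colSums(seqtab.nochim > 0) >= 2]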
#' ### 4. FORMAT OUTPUT to obtain ASV IDs and repset, and input for mctoolsr
#' For convenience's sake, we will now rename our ASVs with numbers, output our
#' results as a traditional taxa table, and create a matrix with the representative
#' sequences for each ASV.
# Flip table
seqtab.t <- as.data.frame(t(seqtab.nochim))

# Pull out ASV repset
rep_set_ASVs <- as.data.frame(rownames(seqtab.t))
rep_set_ASVs <- mutate(rep_set_ASVs, ASV_ID = 1:n())
rep_set_ASVs$ASV_ID <- sub("^", "ASV_", rep_set_ASVs$ASV_ID)
rep_set_ASVs$ASV <- rep_set_ASVs$`rownames(seqtab.t)`
rep_set_ASVs$`rownames(seqtab.t)` <- NULL

# Add ASV numbers to table
rownames(seqtab.t) <- rep_set_ASVs$ASV_ID
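# (optional) sanity check: each ASV ID should correspond to exactly one sequence
stopifnot(nrow(rep_set_ASVs) == nrow(seqtab.t),
          !any(duplicated(rep_set_ASVs$ASV_ID)))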
# Also export the ASV table as .txt
write.table(seqtab.t, file = paste0(table.fp, "/seqtab_final.txt"),
            sep = "\t", row.names = TRUE, col.names = NA)
# Assign taxonomy
# PEDRO's alteration to get the right file path; uses the same database
# NOTE: this step can take a very long time on large datasets, so you may prefer to run it
# separately (or assign taxonomy externally with qiime2 classify-sklearn, as noted above)
# for 16S
tax <- assignTaxonomy(seqtab.nochim, "/lustre/nobackup/INDIVIDUAL/besch001/dada2_fix/tutorial_repository/silva_nr99_v138.1_train_set.fa", tryRC = TRUE, multithread = TRUE)
# for ITS
# tax <- assignTaxonomy(seqtab.nochim, "/lustre/nobackup/INDIVIDUAL/besch001/dada2_fix/tutorial_repository/UNITE_database/sh_general_release_all_10.05.2021/sh_general_release_dynamic_all_10.05.2021_dev.fasta", tryRC = TRUE, multithread = TRUE)
# Write results to disk
saveRDS(tax, paste0(table.fp, "/tax_final.rds"))
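# (optional) count how many ASVs are unassigned (NA) at each taxonomic rank; this is the
# kind of check that motivated trying qiime2 classify-sklearn as an external alternative
colSums(is.na(tax))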
# Add ASV numbers to taxonomy
taxonomy <- as.data.frame(tax)
taxonomy$ASV <- as.factor(rownames(taxonomy))
taxonomy <- merge(rep_set_ASVs, taxonomy, by = "ASV")
rownames(taxonomy) <- taxonomy$ASV_ID
taxonomy_for_mctoolsr <- unite(taxonomy, "taxonomy",
                               c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "ASV_ID"),
                               sep = ";")
# Write repset to fasta file
# create a function that writes fasta sequences
writeRepSetFasta <- function(data, filename){
  fastaLines <- c()
  for (rowNum in 1:nrow(data)){
    fastaLines <- c(fastaLines, as.character(paste(">", data[rowNum, "name"], sep = "")))
    fastaLines <- c(fastaLines, as.character(data[rowNum, "seq"]))
  }
  fileConn <- file(filename)
  writeLines(fastaLines, fileConn)
  close(fileConn)
}
# Arrange the taxonomy dataframe for the writeRepSetFasta function
taxonomy_for_fasta <- taxonomy %>%
  unite("TaxString", c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "ASV_ID"),
        sep = ";", remove = FALSE) %>%
  unite("name", c("ASV_ID", "TaxString"),
        sep = " ", remove = TRUE) %>%
  select(ASV, name) %>%
  rename(seq = ASV)

# write repset to fasta file
writeRepSetFasta(taxonomy_for_fasta, paste0(table.fp, "/repset.fasta"))
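# (alternative sketch) the same fasta could be written with Biostrings, which is loaded as a
# dependency of ShortRead above; this should match the output of writeRepSetFasta
# seqs <- Biostrings::DNAStringSet(setNames(taxonomy_for_fasta$seq, taxonomy_for_fasta$name))
# Biostrings::writeXStringSet(seqs, paste0(table.fp, "/repset.fasta"))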
# Merge taxonomy and table
seqtab_wTax <- merge(seqtab.t, taxonomy_for_mctoolsr, by = 0)
seqtab_wTax$ASV <- NULL
# Set name of table in mctoolsr format and save
out_fp <- paste0(table.fp, "/seqtab_wTax_mctoolsr.txt")
names(seqtab_wTax)[1] = "#ASV_ID"
write("#Exported for mctoolsr", out_fp)
suppressWarnings(write.table(seqtab_wTax, out_fp, sep = "\t", row.names = FALSE, append = TRUE))
# Also export taxonomy as .txt
# PEDRO'S alteration: table based on the object 'taxonomy' and not the object 'tax'
# ('tax' does not have ASV numbers; the rownames get changed into a column named
# "Row.names" at some point in the script)
write.table(select(taxonomy, -ASV, -ASV_ID), file = paste0(table.fp, "/tax_final.txt"),
            sep = "\t", row.names = TRUE, col.names = NA)
#' ### Summary of output files:
#' 1. seqtab_final.txt - a tab-delimited sequence-by-sample (i.e. OTU) table
#' 2. tax_final.txt - a tab-delimited file showing the relationship between ASVs, ASV IDs, and their taxonomy
#' 3. seqtab_wTax_mctoolsr.txt - a tab-delimited file with ASVs as rows, samples as columns, and the final column showing the taxonomy of the ASV ID
#' 4. repset.fasta - a fasta file with the representative sequence of each ASV. Fasta headers are the ASV ID and taxonomy string.
#'
#'
#' ### 5. Summary of reads throughout pipeline
#' Here we track the reads throughout the pipeline to see if any step is resulting in a greater-than-expected loss of reads. If a step shows a greater-than-expected loss, it is a good idea to go back to that step and troubleshoot why reads are dropping out. The dada2 tutorial has more details about what can be changed at each step.
#'
getN <- function(x) sum(getUniques(x)) # function to grab sequence counts from output objects

# tracking reads by counts
# NOTE: adjust the gsub pattern so that it strips the file-name suffix of your raw reads
filt_out_track <- filt_out %>%
  data.frame() %>%
  mutate(Sample = gsub("_16S_R1.fastq.gz", "", rownames(.))) %>%
  rename(input = reads.in, filtered = reads.out)
rownames(filt_out_track) <- filt_out_track$Sample

ddF_track <- data.frame(denoisedF = sapply(ddF[sample.names], getN)) %>%
  mutate(Sample = row.names(.))
ddR_track <- data.frame(denoisedR = sapply(ddR[sample.names], getN)) %>%
  mutate(Sample = row.names(.))
merge_track <- data.frame(merged = sapply(mergers, getN)) %>%
  mutate(Sample = row.names(.))
chim_track <- data.frame(nonchim = rowSums(seqtab.nochim)) %>%
  mutate(Sample = row.names(.))

track <- left_join(filt_out_track, ddF_track, by = "Sample") %>%
  left_join(ddR_track, by = "Sample") %>%
  left_join(merge_track, by = "Sample") %>%
  left_join(chim_track, by = "Sample") %>%
  replace(., is.na(.), 0) %>%
  select(Sample, everything())
row.names(track) <- track$Sample
head(track)
# tracking reads by percentage
track_pct <- track %>%
  data.frame() %>%
  mutate(Sample = rownames(.),
         filtered_pct = ifelse(filtered == 0, 0, 100 * (filtered/input)),
         denoisedF_pct = ifelse(denoisedF == 0, 0, 100 * (denoisedF/filtered)),
         denoisedR_pct = ifelse(denoisedR == 0, 0, 100 * (denoisedR/filtered)),
         merged_pct = ifelse(merged == 0, 0, 100 * merged/((denoisedF + denoisedR)/2)),
         nonchim_pct = ifelse(nonchim == 0, 0, 100 * (nonchim/merged)),
         total_pct = ifelse(nonchim == 0, 0, 100 * nonchim/input)) %>%
  select(Sample, ends_with("_pct"))

# summary stats of tracked reads averaged across samples
track_pct_avg <- track_pct %>% summarize_at(vars(ends_with("_pct")),
                                            list(avg = mean))
track_pct_med <- track_pct %>% summarize_at(vars(ends_with("_pct")),
                                            list(med = stats::median))
head(track_pct_avg)
head(track_pct_med)
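# (optional) flag samples that retained less than half of their input reads end-to-end;
# these are good candidates for troubleshooting
track_pct %>% filter(total_pct < 50)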
# Plotting each sample's reads through the pipeline
track_plot <- track %>%
  data.frame() %>%
  mutate(Sample = rownames(.)) %>%
  gather(key = "Step", value = "Reads", -Sample) %>%
  mutate(Step = factor(Step,
                       levels = c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim"))) %>%
  ggplot(aes(x = Step, y = Reads)) +
  geom_line(aes(group = Sample), alpha = 0.2) +
  geom_point(alpha = 0.5, position = position_jitter(width = 0)) +
  stat_summary(fun = median, geom = "line", group = 1, color = "steelblue", size = 1, alpha = 0.5) +
  stat_summary(fun = median, geom = "point", group = 1, color = "steelblue", size = 2, alpha = 0.5) +
  stat_summary(fun.data = median_hilow, fun.args = list(conf.int = 0.5),
               geom = "ribbon", group = 1, fill = "steelblue", alpha = 0.2) +
  geom_label(data = t(track_pct_avg[1:5]) %>% data.frame() %>%
               rename(Percent = 1) %>%
               mutate(Step = c("filtered", "denoisedF", "denoisedR", "merged", "nonchim"),
                      Percent = paste(round(Percent, 2), "%")),
             aes(label = Percent), y = 1.1 * max(track[,2])) +
  geom_label(data = track_pct_avg[6] %>% data.frame() %>%
               rename(total = 1),
             aes(label = paste("Total\nRemaining:\n", round(track_pct_avg[1,6], 2), "%")),
             y = mean(track[,6]), x = 6.5) +
  expand_limits(y = 1.1 * max(track[,2]), x = 7) +
  theme_classic()
track_plot
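# (optional) an interactive version of the tracking plot can make it easier to identify
# individual samples; plotly is already loaded above
# ggplotly(track_plot)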
#'
# Write results to disk
saveRDS(track, paste0(project.fp, "/tracking_reads.rds"))
saveRDS(track_pct, paste0(project.fp, "/tracking_reads_percentage.rds"))
saveRDS(track_plot, paste0(project.fp, "/tracking_reads_summary_plot.rds"))
ggsave(plot = track_plot, filename = paste0(project.fp, "/tracking_reads_summary_plot.png"), width = 10, height = 10, dpi = "retina")
#' | <span> |
#' | :--- |
#' | **STOP - 06_remove_chimeras_assign_taxonomy_dada2_tutorial_16S.R:** If you are running this on Premise, make sure that you are using the appropriate database before running this step with slurm. |
#' | <span> |
#'
#' ## Next Steps
#' You can now transfer the output files onto your local computer.
#' The table and taxonomy can be read into R with the 'mctoolsr' package or another R package of your choosing.
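# (sketch) how these exports might be read back into R with mctoolsr, assuming a sample
# mapping file (hypothetically named "metadata.txt" here) has been prepared
# library(mctoolsr)
# input <- load_taxa_table("seqtab_wTax_mctoolsr.txt", "metadata.txt")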
#'
#' ### Post-pipeline considerations
#' After following this pipeline, you will need to think about the following in downstream applications:
#'
#' 1. Remove mitochondrial and chloroplast sequences
#' 2. Remove reads assigned as eukaryotes
#' 3. Remove reads that are unassigned at the domain level (also consider removing those unassigned at the phylum level)
#' 4. Normalize or rarefy your ASV table
#'
#' Enjoy your data!
#'
#+ include=FALSE
# this is to save the R environment if you are running the pipeline in pieces with slurm
save.image(file = "dada2_ernakovich_Renv.RData")
sink() ## close sink connection