# Snakemake file - input quality controlled fastq reads to generate asv
# Pankaj Chejara
# Base snakefile: https://github.com/shu251/tagseq-qiime2-snakemake/blob/master/Snakefile-asv
"config.yaml"
configfile:
import io
import os
import pandas as pd
import pathlib
##########################################################
# SET CONFIG VARS
##########################################################
= config["project"]
PROJ = config["scratch"]
SCRATCH = config["raw_data"]
INPUTDIR = config['outputDIR']
OUTPUTDIR = config["manifest"]
MANIFEST = config["metadata"]
METADATA = config['sampling_depth']
SAMPLING_DEPTH
# Database information
= config["database"]
DB = config["database_classifier"]
DB_classifier = config["database_tax"]
DB_tax
rule all:
    input:
        q2_import = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux.qza",
        q2_primerRM = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux-noprimer.qza",
        # Visualization
        raw = SCRATCH + "/" + OUTPUTDIR + "/viz/" + PROJ + "-PE-demux.qzv",
        primer = SCRATCH + "/" + OUTPUTDIR + "/viz/" + PROJ + "-PE-demux-noprimer.qzv",
        # Dada2 results
        table = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-asv-table.qza",
        rep = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-rep-seqs.qza",
        stats = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-stats-dada2.qza",
        stats_viz = SCRATCH + "/" + OUTPUTDIR + "/viz/" + PROJ + "-stats-dada2.qzv",
        # Taxonomic table
        sklearn = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-tax_sklearn.qza",
        table_biom = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "feature-table.biom",
        table_tsv = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-asv-table.tsv",
        table_tax = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "taxonomy.tsv",
        # Phylogenetic outputs
        aligned_seqs = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-aligned-rep-seqs.qza",
        aligned_masked = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-masked-aligned-rep-seqs.qza",
        unrooted_tree = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-unrooted-tree.qza",
        rooted_tree = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-rooted-tree.qza",
        # Diversity metrics
        output_dir = directory(SCRATCH + "/qiime2/diversity")
##########################################################
# LOAD DATA
##########################################################
rule import_qiime:
    input:
        MANIFEST
    output:
        q2_import = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux.qza"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_q2.log"
    params:
        type = "SampleData[PairedEndSequencesWithQuality]",
        input_format = "PairedEndFastqManifestPhred33"
    shell:
        """qiime tools import \
        --type {params.type} \
        --input-path {input} \
        --output-path {output.q2_import} \
        --input-format {params.input_format}
        """
##########################################################
# REMOVE PRIMERS
##########################################################
rule rm_primers:
    input:
        q2_import = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux.qza"
    output:
        q2_primerRM = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux-noprimer.qza"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_primer_q2.log"
    shell:
        """qiime cutadapt trim-paired \
        --i-demultiplexed-sequences {input.q2_import} \
        --p-front-f {config[primerF]} \
        --p-front-r {config[primerR]} \
        --p-error-rate {config[primer_err]} \
        --p-overlap {config[primer_overlap]} \
        --o-trimmed-sequences {output.q2_primerRM}"""
##########################################################
# QC STATS
##########################################################
rule get_stats:
    input:
        q2_import = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux.qza",
        q2_primerRM = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux-noprimer.qza"
    output:
        raw = SCRATCH + "/" + OUTPUTDIR + "/viz/" + PROJ + "-PE-demux.qzv",
        primer = SCRATCH + "/" + OUTPUTDIR + "/viz/" + PROJ + "-PE-demux-noprimer.qzv"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_getviz_q2.log"
    shell:
        """
        qiime demux summarize --i-data {input.q2_import} --o-visualization {output.raw}
        qiime demux summarize --i-data {input.q2_primerRM} --o-visualization {output.primer}
        """
##########################################################
# DENOISE & ASVs
##########################################################
rule dada2:
    input:
        q2_primerRM = SCRATCH + "/" + OUTPUTDIR + "/" + PROJ + "-PE-demux-noprimer.qza"
    output:
        table = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-asv-table.qza",
        rep = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-rep-seqs.qza",
        stats = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-stats-dada2.qza"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_dada2_q2.log"
    shell:
        """qiime dada2 denoise-paired \
        --i-demultiplexed-seqs {input.q2_primerRM} \
        --p-trunc-q {config[truncation_err]} \
        --p-trunc-len-f {config[truncation_len-f]} \
        --p-trunc-len-r {config[truncation_len-r]} \
        --o-table {output.table} \
        --o-representative-sequences {output.rep} \
        --o-denoising-stats {output.stats}"""

rule dada2_stats:
    input:
        stats = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-stats-dada2.qza"
    output:
        stats_viz = SCRATCH + "/" + OUTPUTDIR + "/viz/" + PROJ + "-stats-dada2.qzv"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_dada2-stats_q2.log"
    shell:
        """qiime metadata tabulate \
        --m-input-file {input.stats} \
        --o-visualization {output.stats_viz}"""
##########################################################
# TAXONOMIC ASSIGNMENT
##########################################################
rule assign_tax:
    input:
        rep = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-rep-seqs.qza",
        db_classified = DB_classifier
    output:
        sklearn = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-tax_sklearn.qza"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_sklearn_q2.log"
    shell:
        """qiime feature-classifier classify-sklearn \
        --i-classifier {input.db_classified} \
        --i-reads {input.rep} \
        --o-classification {output.sklearn}"""
##########################################################
# TAXONOMIC TABLE GENERATION
##########################################################
rule gen_table:
    input:
        table = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-asv-table.qza"
    output:
        table_biom = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "feature-table.biom"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_exportBIOM_q2.log"
    params:
        directory(SCRATCH + "/" + OUTPUTDIR + "/asv/")
    shell:
        "qiime tools export --input-path {input.table} --output-path {params}"

rule convert:
    input:
        table_biom = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "feature-table.biom"
    output:
        SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-asv-table.tsv"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_exportTSV_q2.log"
    shell:
        "biom convert -i {input} -o {output} --to-tsv"

rule gen_tax:
    input:
        sklearn = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-tax_sklearn.qza"
    output:
        table_tax = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "taxonomy.tsv"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_exportTAXTSV_q2.log"
    params:
        directory(SCRATCH + "/" + OUTPUTDIR + "/asv/")
    shell:
        "qiime tools export --input-path {input.sklearn} --output-path {params}"
##########################################################
# PHYLOGENETIC TREE
##########################################################
rule phy_tree:
    input:
        rep = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-rep-seqs.qza"
    output:
        aligned_seqs = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-aligned-rep-seqs.qza",
        aligned_masked = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-masked-aligned-rep-seqs.qza",
        unrooted_tree = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-unrooted-tree.qza",
        rooted_tree = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-rooted-tree.qza"
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_phylogeneticTREE_q2.log"
    shell:
        """qiime phylogeny align-to-tree-mafft-fasttree \
        --i-sequences {input.rep} \
        --o-alignment {output.aligned_seqs} \
        --o-masked-alignment {output.aligned_masked} \
        --o-tree {output.unrooted_tree} \
        --o-rooted-tree {output.rooted_tree}"""
##########################################################
# DIVERSITY METRICS
##########################################################
rule div_met:
    input:
        rooted_tree = SCRATCH + "/" + OUTPUTDIR + "/asv/" + "tree/" + PROJ + "-rooted-tree.qza",
        table = SCRATCH + "/" + OUTPUTDIR + "/asv/" + PROJ + "-asv-table.qza"
    output:
        output_dir = directory(SCRATCH + "/qiime2/diversity")
    log:
        SCRATCH + "/" + OUTPUTDIR + "/logs/" + PROJ + "_diversity_q2.log"
    shell:
        """qiime diversity core-metrics-phylogenetic \
        --i-phylogeny {input.rooted_tree} \
        --i-table {input.table} \
        --p-sampling-depth {SAMPLING_DEPTH} \
        --m-metadata-file {METADATA} \
        --output-dir {output.output_dir}"""
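With the configuration file (shown below) in place, the workflow above can be launched from the project directory. A minimal invocation, assuming the file is saved as Snakefile-asv (Snakemake otherwise looks for a file named Snakefile by default; depending on your Snakemake version, the dry run may also require --cores):

# Preview the planned jobs without executing anything (dry run)
snakemake -n -s Snakefile-asv

# Run the whole pipeline, allowing up to 4 jobs in parallel
snakemake --cores 4 -s Snakefile-asv

The dry run is a cheap way to confirm that the rule graph resolves, i.e., that every target in rule all can be produced by some rule, before committing to the full computation.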
Streamlining the analysis process

In the previous chapters, we have seen the various steps involved in generating taxonomic assignments from raw read data. Executing all of those steps manually, one by one, is time-consuming and error-prone. To address these challenges, several workflow management tools are available (e.g., Snakemake, Nextflow). These tools automate the execution of the entire pipeline and make it reproducible, so the same workflow can be rerun on different datasets.

Here, we will see how to use Snakemake, a workflow management tool, to streamline the entire process of taxonomy generation.
Snakemake configuration file

As a first step, we will create a configuration file that keeps all the information needed for our analysis in one place: the project directory, the names of the output files created during the analysis, parameter values for the tools, and so on. The configuration file stores this information as key: value pairs. The file below is the configuration we will use for our short analysis (i.e., primer removal -> ASV generation -> taxonomy assignment).
## Basic config
project: trial
scratch: /Users/pankaj/Documents/Metrosert/public_dataset/zakular2014
raw_data: partial_raw_data
outputDIR: partial_raw_asv
manifest: manifest_partial.csv
metadata: metadata_partial.tsv
## 16S adapters from Zackular et al., 2014
primerF: GTGCCAGCMGCCGCGGTAA
primerR: GGACTACHVGGGTWTCTAAT
primer_err: 0.4
primer_overlap: 3
## Reference database
database: silva-138.1-ssu-nr99-seqs-515f-806r.qza
database_classifier: silva_classifier.qza
database_tax: silva-138.1-ssu-nr99-tax.qza
## DADA2 - ASV flags
truncation_err: 2
truncation_len-f: 150
truncation_len-r: 140
quality_err: 2
## Diversity metrics
sampling_depth: 500
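Inside the Snakefile, these keys become available through Snakemake's global config dictionary once the file is declared with the configfile: directive. A minimal sketch restating how the Snakefile above consumes them (TRUNC_F is an illustrative name, not one used in the pipeline):

configfile: "config.yaml"

# After the configfile directive, the YAML keys are plain Python values
# in the global `config` dictionary.
PROJ = config["project"]                     # -> "trial"
SAMPLING_DEPTH = config["sampling_depth"]    # -> 500

# Hyphenated keys work too, both as dictionary lookups ...
TRUNC_F = config["truncation_len-f"]         # -> 150
# ... and directly inside rule shell strings, e.g. {config[truncation_len-f]}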
Pipeline

Now we will put all the analysis code in one place for Snakemake to use. To do that, we create a file, conventionally without any extension, for example Snakefile-asv (the complete file is shown at the top of this chapter). Inside it, we write each analysis step in Snakemake's rule format. Basically, we will do the following:
- Specify the configuration file
- Extract all the information provided in the configuration file
- Write a rule for each analysis step, following the skeleton sketched below
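Each rule names its inputs and outputs and gives the command that produces one from the other; Snakemake chains rules together by matching outputs to inputs. A generic skeleton, where every name is a placeholder rather than part of the pipeline above:

rule step_name:                      # hypothetical rule name
    input:
        "path/to/input.qza"          # file(s) this step consumes
    output:
        "path/to/output.qza"         # file(s) this step promises to create
    log:
        "logs/step_name.log"         # where to record the run
    shell:
        "some-tool --in {input} --out {output}"

In the shell command, {input} and {output} are expanded by Snakemake to the declared paths at run time, which is exactly how the qiime commands in the rules above receive their arguments.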
References
- https://github.com/shu251/tagseq-qiime2-snakemake/blob/master/Snakefile-asv
- https://forum.qiime2.org/t/qiime2-snakemake-workflow-tutorial-18s-16s-tag-sequencing/11334