# Dada2.R
####Preparing the Data####
setwd("Working/Dir")  # set this to your own working directory
library("dada2")
library("DECIPHER")   # provides AlignSeqs(); attaches Biostrings for DNAStringSet()
library("devtools")
library("dplyr")
library("ggplot2")
library("gplots")     # provides barplot2(), used in the read-tracking plot below
library("microbiome")
library("phangorn")
library("phyloseq")
library("Rcpp")
library("reshape2")
library("tidyr")
library("vegan")
##
path = "Reads"
fnFs = sort(list.files(path, pattern="_1.fq.gz"))
fnRs = sort(list.files(path, pattern="_2.fq.gz"))
sample.names = sapply(strsplit(fnFs, "_"), `[`, 1)
show(sample.names)
fnFs = file.path(path, fnFs)
fnRs = file.path(path, fnRs)
filt_path = file.path(path, "filtered")
filtFs = file.path(filt_path, paste0(sample.names, "_F_filt.fastq"))
filtRs = file.path(filt_path, paste0(sample.names, "_R_filt.fastq"))
plotQualityProfile(fnFs[1:2])
plotQualityProfile(fnRs[1:2])
dev.off()
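# The truncation lengths below (200 forward, 160 reverse) should match where
# these quality profiles start to degrade; adjust them if your profiles differ.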
####Filtering and Trimming####
out = filterAndTrim(fnFs, filtFs, fnRs, filtRs,
                    truncLen=c(200,160), trimLeft=c(19,20),
                    maxN=0, maxEE=c(2,2), truncQ=2, rm.phix=TRUE,
                    compress=TRUE, multithread=TRUE)
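# Optional sanity check: per-sample read counts in and out of the filter,
# to confirm the trimming parameters are not discarding too many reads.
head(out)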
####Error Correction and Dereplication####
errF = learnErrors(filtFs, nbases = 1e8, multithread=TRUE, randomize=TRUE, MAX_CONSIST=15)
#100206400 total bases in 622400 reads from 30 samples will be used for learning the error rates.
errR = learnErrors(filtRs, nbases = 1e8, multithread=TRUE, randomize=TRUE, MAX_CONSIST=15)
#101696000 total bases in 726400 reads from 35 samples will be used for learning the error rates.
mergers = vector("list", length(sample.names))
names(mergers) = sample.names
names(filtFs) = sample.names
names(filtRs) = sample.names
plotErrors(errF, nominalQ=TRUE)
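# The reverse-read error model can be inspected the same way:
plotErrors(errR, nominalQ=TRUE)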
for(sam in sample.names) {
  cat("Processing:", sam, "\n")
  derepF = derepFastq(filtFs[[sam]])
  ddF = dada(derepF, err=errF, multithread=TRUE)
  derepR = derepFastq(filtRs[[sam]])
  ddR = dada(derepR, err=errR, multithread=TRUE)
  merger = mergePairs(ddF, derepF, ddR, derepR)
  mergers[[sam]] = merger
}
rm(derepF); rm(derepR)
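# Optional check: inspect the merged pairs of the first sample; most reads
# should merge if the truncation lengths leave enough overlap.
head(mergers[[1]])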
####Sequence Table and Chimera Removal####
seqtab = makeSequenceTable(mergers)
seqtab.nochim = removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE)
#Identified 9374 bimeras out of 9676 input sequences.
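# Optional checks: size of the chimera-free table and the length
# distribution of the merged sequences.
dim(seqtab.nochim)
table(nchar(getSequences(seqtab.nochim)))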
####Tracking Reads and Exporting ASVs####
getN = function(x) sum(getUniques(x))
track2 = cbind(out, sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
colnames(track2) <- c("input", "filtered", "merged", "tabled", "nonchim")
write.table(track2, "track.txt", sep = "\t", quote = F)
uniquesToFasta(getUniques(seqtab.nochim), "seqtab.nochim.fasta", ids=paste0("sq", seq(length(getUniques(seqtab.nochim)))))
export <- t(seqtab.nochim)
rownames(export) = paste0("sq", seq(length(getUniques(seqtab.nochim))))
export <- cbind('#OTUID' = rownames(export), export)
write.table(export, "otu_table.dada.nochim.txt", sep='\t', row.names=FALSE, quote=FALSE)
sum(seqtab.nochim)/sum(seqtab)
#[1] 0.8130383
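# Although ~97% of unique sequences (9374/9676) were flagged as bimeras, they
# account for only ~19% of total reads, hence the ~0.81 retention above.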
####Assign taxonomy - SILVA####
taxa <- assignTaxonomy(seqtab.nochim, "silva_nr_v132_train_set.fa", multithread=TRUE)
taxa <- addSpecies(taxa, "silva_species_assignment_v132.fa", allowMultiple = TRUE)
write.table(taxa, "taxa_table.dada.nochim.txt", sep='\t', row.names=FALSE, quote=FALSE)
#inspect the taxonomic assignments#
taxa.print <- taxa # Removing sequence rownames for display only
rownames(taxa.print) <- NULL
head(taxa.print)
####Giving seq headers more manageable names (ASV_1, ASV_2, ...)####
asv_seqs <- colnames(seqtab.nochim)
asv_headers <- vector(dim(seqtab.nochim)[2], mode="character")
for (i in 1:dim(seqtab.nochim)[2]) {
  asv_headers[i] <- paste(">ASV", i, sep="_")
}
asv_tax <- taxa
row.names(asv_tax) <- sub(">", "", asv_headers)
write.table(asv_tax, "ASVs_taxonomy.tsv", sep="\t", quote=F, col.names=NA)
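# asv_seqs is built above but never written out; a minimal FASTA export of the
# renamed ASVs (output filename is illustrative):
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, "ASVs.fa")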
####Construct Phylogenetic Tree####
seqs <- getSequences(seqtab.nochim)
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- AlignSeqs(DNAStringSet(seqs), anchor=NA)
library(phangorn)
phang.align <- phyDat(as(alignment, "matrix"), type="DNA")
dm <- dist.ml(phang.align)
treeNJ <- NJ(dm) # Note, tip order != sequence order
fit = pml(treeNJ, data=phang.align)
## negative edges length changed to 0!
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
rearrangement = "stochastic", control = pml.control(trace = 0))
detach("package:phangorn", unload=TRUE)
####Phyloseq Object####
sample_info_tab <- read.table("sample_info_shime.tsv", header=T, row.names=1,
check.names=F, sep="\t")
Twin_Shime_Longitudinal_Study <- phyloseq(tax_table(taxa), sample_data(sample_info_tab),
                                          otu_table(seqtab.nochim, taxa_are_rows = FALSE), phy_tree(fitGTR$tree))
taxa_names(Twin_Shime_Longitudinal_Study) <- paste0("Seq", seq(ntaxa(Twin_Shime_Longitudinal_Study)))
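# Optionally persist the object so the expensive denoising and tree-fitting
# steps need not be re-run (filename is illustrative):
saveRDS(Twin_Shime_Longitudinal_Study, "Twin_Shime_phyloseq.rds")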
#########################################################################################
summarize_phyloseq(Twin_Shime_Longitudinal_Study)
pseq <- Twin_Shime_Longitudinal_Study
#Pick metadata as data.frame:
meta <- meta(pseq)
#Renaming taxa
taxa_names(pseq) <- paste0("Seq", seq(ntaxa(pseq)))
#Taxonomy table:
taxonomy <- tax_table(pseq)
#Abundances for taxonomic groups ('OTU table') as a taxa x samples matrix:
# Absolute abundances
otu.absolute <- abundances(pseq)
# Relative abundances
otu.relative <- abundances(pseq, "compositional")
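# Quick check: compositional abundances should sum to 1 within each sample
# (abundances() returns a taxa x samples matrix).
colSums(otu.relative)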
##Tree
phylogenetic <- phy_tree(pseq)
####Read Filter####
## TODO: this one is still missing
track2 <- read.table("track.txt", header=T, check.names=F, sep="\t")
ma <- data.matrix(track2)
barplot(ma, main="Track Reads", font.axis=1, cex.axis=1)
barplot2(ma)  # barplot2() comes from the gplots package loaded above
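# A grouped layout may read better than the stacked default; an alternative
# sketch in base R, with one bar per pipeline stage within each sample:
barplot(t(ma), beside=TRUE, legend.text=colnames(ma),
        args.legend=list(x="topright", bty="n"),
        main="Track Reads", las=2)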