All of the code in this page is meant to be run on the command line unless otherwise stated.
First, download 16S rRNA and shotgun metadata from the HMP's website:
# Download list of shotgun samples
wget http://hmpdacc.org/HMASM/HMASM-690.csv
# Download list of 16S samples
wget http://downloads.hmpdacc.org/data/HMR16S/SRP002395_metadata_lmd.tar.gz
# Decompress it
tar -zxvf SRP002395_metadata_lmd.tar.gz
# Dowload metaphlan taxon table
wget http://downloads.hmpdacc.org/data/HMSMCP/HMP.ab.txt.bz2
# Decompress it
bzip2 -d HMP.ab.txt.bz2
Filter 16S "Buccal mucosa" and "Tongue Dorsum" sample runs by the V3-V5 regions (same region for both):
# Buccal mucosa
grep -h "Buccal mucosa" corrected-metadata-v1/* | grep "V5-V3" | awk -F"\t" '$1 != "NULL"' > BuccalMucosa-16S-AllRuns.tsv
# Tongue dorsum
grep -h "Tongue dorsum" corrected-metadata-v1/* | grep "V5-V3" | awk -F"\t" '$1 != "NULL"' > TongueDorsum-16S-AllRuns.tsv
Intersect 16S and shotgun samples which have runs for both sequencing methods (just SRS):
# SRS which have shotgun for "Buccal Mucosa"
awk -F"\t" '{print $10}' BuccalMucosa-16S-AllRuns.tsv | sort | uniq | while read SRS
do
grep $SRS HMASM-690.csv
done | grep -o "SRS[0-9]*" | head -n 10 > BuccalMucosa-16S-WGS-10Samples.tsv
# SRS which has shotgun for "Tongue Dorsum"
awk -F"\t" '{print $10}' TongueDorsum-16S-AllRuns.tsv | sort | uniq | while read SRS
do
grep $SRS HMASM-690.csv
done | grep -o "SRS[0-9]*" | head -n 10 > TongueDorsum-16S-WGS-10Samples.tsv
Based on the SRS, select 16S runs for samples whose intersection was achieved (complete information from the 16S metadata files):
# For "Buccal Muccosa"
# If every print is "1" it is OK
echo "Buccal Mucosa (check: must be 1 for every run)"
cat BuccalMucosa-16S-WGS-10Samples.tsv | while read SRS
do
grep -h $SRS corrected-metadata-v1/* | awk -F"\t" '$1 != "NULL"' | awk -F"\t" '{print $1}' | while read SRR
do
echo $SRR: $(awk -F"\t" '$1 != "NULL"' corrected-metadata-v1/$SRR.lmd | wc -l)
done
done
# For "Tongue Dorsum"
# If every print is "1" it is OK
echo "Tongue Dorsum (check: must be 1 for every run)"
cat TongueDorsum-16S-WGS-10Samples.tsv | while read SRS
do
grep -h $SRS corrected-metadata-v1/* | awk -F"\t" '$1 != "NULL"' | awk -F"\t" '{print $1}' | while read SRR
do
echo $SRR: $(awk -F"\t" '$1 != "NULL"' corrected-metadata-v1/$SRR.lmd | wc -l)
done
done
Download the V3-V5 OTU table generated by QIIME processing (from HMP):
# Download the OTU table
wget http://downloads.hmpdacc.org/data/HMQCP/otu_table_psn_v35.txt.gz
# Unzip
gzip -d otu_table_psn_v35.txt.gz
Create a SRS-PSN map to filter samples from the HMP OTU table:
# Download the PSN map
wget http://hmpdacc.org/doc/ppAll_V35_map.txt
# Filter buccal mucosa samples
cat BuccalMucosa-16S-WGS-10Samples.tsv | while read SRS
do
grep $SRS ppAll_V35_map.txt | grep "Buccal_mucosa" | awk -F"\t" '{print $8"\t"$3}'
done | sort | uniq > BuccalMucosa-SRS-PSN-Map.tsv
# Filter tongue dorsum samples
cat TongueDorsum-16S-WGS-10Samples.tsv | while read SRS
do
grep $SRS ppAll_V35_map.txt | grep "Tongue_dorsum" | awk -F"\t" '{print $8"\t"$3}'
done | sort | uniq > TongueDorsum-SRS-PSN-Map.tsv
This step was performed in R. Get the selected SRSs for both Buccal Mucosa and Tongue Dorsum samples and filter them from the final OTU files:
- 20 samples (10 for Buccal Mucosa, 10 for Tongue Dorsum)
- OTUs filtered according to a minimum of 3 ocurrences (considering all the samples)
## Get data for ALL SAMPLES
OTUTable <- read.delim(file = "otu_table_psn_v35.txt", header = TRUE, sep = "\t", skip = 1)
## Load the SRS -> PSN table
BuccalMucosa_SRS_PSN_Table <- read.delim("BuccalMucosa-SRS-PSN-Map.tsv", header = FALSE, sep = "\t", col.names = c("SRS", "PSN"))
TongueDorsum_SRS_PSN_Table <- read.delim("TongueDorsum-SRS-PSN-Map.tsv", header = FALSE, sep = "\t", col.names = c("SRS", "PSN"))
## Add the column index (SRS -> PSN -> Column)
BuccalMucosa_SRS_PSN_Table$Column <- 0
TongueDorsum_SRS_PSN_Table$Column <- 0
for (i in 1:nrow(BuccalMucosa_SRS_PSN_Table)) {
BuccalMucosa_SRS_PSN_Table$Column[i] <- grep(BuccalMucosa_SRS_PSN_Table$PSN[i], colnames(OTUTable))
TongueDorsum_SRS_PSN_Table$Column[i] <- grep(TongueDorsum_SRS_PSN_Table$PSN[i], colnames(OTUTable))
}
## Filter by the columns of interest
OTUTable_BM_TD <- OTUTable[,c(1, BuccalMucosa_SRS_PSN_Table$Column, TongueDorsum_SRS_PSN_Table$Column, ncol(OTUTable))]
## Rename columns, incluing PSN to SRS translation
colnames(OTUTable_BM_TD) <- c("OTU_ID",
as.character(BuccalMucosa_SRS_PSN_Table$SRS),
as.character(TongueDorsum_SRS_PSN_Table$SRS),
"taxonomy")
## Get rows whose sum is greater than
ValuesColmuns <- 2:(ncol(OTUTable_BM_TD)-1)
RowSums <- apply(OTUTable_BM_TD[,ValuesColmuns], 1, function(X) {
sum(X)
})
RowSumsGreaterThan3 <- which(RowSums > 3)
## Filter by rows of interesting
OTUTable_BM_TD <- OTUTable_BM_TD[RowSumsGreaterThan3,]
## Export the OTU Table
write.table(OTUTable_BM_TD, "OTU-Table-16S.tsv", quote = FALSE, sep = "\t", row.names = FALSE)