diff --git a/.travis.yml b/.travis.yml index 63ead4f..22d0e4a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,6 +16,7 @@ before_install: install: - conda create --yes -n env_name python=$PYTHON_VERSION --file conda_requirements.txt - source activate env_name + - conda install -y "setuptools<20.7.0" - pip install -r pip_requirements.txt - pip install -e . --no-deps diff --git a/americangut/per_category.py b/americangut/per_category.py new file mode 100644 index 0000000..3822450 --- /dev/null +++ b/americangut/per_category.py @@ -0,0 +1,36 @@ +from os.path import join +from copy import copy + +from biom import load_table + +import americangut.notebook_environment as agenv +from .util import collapse_full, collapse_taxonomy, get_existing_path + + +def cat_taxa_summaries(): + """Creates taxa summary files for each available summary category per site + """ + paths = copy(agenv.paths['collapsed']['notrim']['1k']) + out_dir = get_existing_path( + agenv.paths['populated-templates']['result-taxa']) + del paths['ag-biom'] + for name, path in paths.items(): + # consistent naming as stool for all participant items + name = name.replace('-biom', '').replace('fecal', 'stool') + table = load_table(get_existing_path(path)) + if len(name.split('-')) == 2: + # Have entire cohort of samples for site, so need to get medians + table = collapse_full(table) + table = collapse_taxonomy(table) + ids = table.ids(axis='observation') + for col in table.ids(): + if col == 'Unknown': + continue + cleaned_col = col.split('(')[0].strip().replace(' ', '_') + filename = '-'.join([name, cleaned_col]) + '.txt' + + with open(join(out_dir, filename), 'w') as f: + for otu, val in zip(ids, table.data(col)): + if val == 0.0: + continue + f.write('%s\t%s\n' % (otu, str(val))) diff --git a/americangut/util.py b/americangut/util.py index b2b9ccb..964f299 100644 --- a/americangut/util.py +++ b/americangut/util.py @@ -11,6 +11,7 @@ from itertools import izip from StringIO import StringIO from collections import defaultdict +from biom import Table from lxml import etree from skbio.parse.sequences import parse_fastq, parse_fasta @@ -635,3 +636,53 @@ def get_single_id_lists(map_, depths): single_ids[depths[idx - 1]].extend(single_ids[depths[idx]]) return single_ids + + +def collapse_taxonomy(_bt, level=5): + """Collapses OTUs by taxonomy + + Parameters + ---------- + _bt : biom table + Table to collapse + level : int, optional + Level to collapse to. 0=kingdom, 1=phylum,...,5=genus, 6=species + Default 5 + + Returns + ------- + biom table + Collapsed biom table + + Citations + --------- + [1] http://biom-format.org/documentation/table_objects.html + """ + def collapse_f(id_, md): + return '; '.join(md['taxonomy'][:level + 1]) + collapsed = _bt.collapse(collapse_f, axis='observation', norm=False) + return collapsed + + +def collapse_full(_bt): + """Collapses full biom table to median of each OTU + + Parameters + ---------- + _bt : biom table + Table to collapse + + Returns + ------- + biom table + Collapsed biom table, one sample containing median of each OTU, + normalized. + """ + num_obs = len(_bt.ids(axis='observation')) + table = Table(np.array( + [np.median(v) for v in _bt.iter_data(axis='observation')]).reshape( + (num_obs, 1)), + _bt.ids(axis='observation'), ['average'], + observation_metadata=_bt.metadata(axis='observation')) + table.norm(inplace=True) + return table diff --git a/ipynb/primary-processing/10-participant_pdfs.md b/ipynb/primary-processing/10-participant_pdfs.md index 4150a6e..39ec247 100644 --- a/ipynb/primary-processing/10-participant_pdfs.md +++ b/ipynb/primary-processing/10-participant_pdfs.md @@ -7,6 +7,7 @@ >>> import americangut.util as agu >>> import americangut.results_utils as agru >>> import americangut.per_sample as agps +>>> import americangut.per_category as agpc >>> import americangut.parallel as agpar ... >>> chp_path = agenv.activate('10-populated-templates') @@ -48,6 +49,12 @@ >>> process_pdf = partial(agps.sample_type_processor, [create_pdf, aggregate], opts) ``` +We also need to write out the taxa summary files for each of the categories in the collapsed data. These will live in the same folder as the participant's taxonomy files. + +```python +>>> agpc.cat_taxa_summaries() +``` + ```python >>> partitions = [(process_pdf, ids)] >>> with open(successful_pdfs, 'w') as successful_pdfs_fp, open(unsuccessful_pdfs, 'w') as unsuccessful_pdfs_fp: diff --git a/tests/data/ag_testing/ag-fecal.biom b/tests/data/ag_testing/ag-fecal.biom new file mode 100644 index 0000000..5feba16 Binary files /dev/null and b/tests/data/ag_testing/ag-fecal.biom differ diff --git a/tests/data/ag_testing/ag-oral-flossing.biom b/tests/data/ag_testing/ag-oral-flossing.biom new file mode 100644 index 0000000..91c0aca Binary files /dev/null and b/tests/data/ag_testing/ag-oral-flossing.biom differ diff --git a/tests/test_per_category.py b/tests/test_per_category.py new file mode 100644 index 0000000..bf60370 --- /dev/null +++ b/tests/test_per_category.py @@ -0,0 +1,90 @@ +from os import makedirs, listdir +from os.path import join, dirname, realpath +from shutil import rmtree + +from unittest import TestCase, main +import americangut.notebook_environment as agenv +from americangut.util import get_existing_path +from americangut.per_category import cat_taxa_summaries + + +class PerCategoryTests(TestCase): + def setUp(self): + # Make expected directory structure + currpath = dirname(realpath(__file__)) + self.dirpath = join(currpath, '../agp_processing/10-populated-templates/taxa') + makedirs(self.dirpath) + self.path = agenv.paths['collapsed']['notrim']['1k'] + agenv.paths['collapsed']['notrim']['1k'] = \ + {'ag-biom': + '../tests/data/ag_testing/category_test.biom', + 'ag-fecal': + '../tests/data/ag_testing/ag-fecal.biom', + 'ag-oral-flossing': + '../tests/data/ag_testing/ag-oral-flossing.biom'} + + def tearDown(self): + rmtree(self.dirpath, ignore_errors=True) + agenv.paths['collapsed']['notrim']['1k'] = self.path + + def test_cat_taxa_summaries(self): + cat_taxa_summaries() + path = get_existing_path( + '../agp_processing/10-populated-templates/taxa') + exp = ['ag-oral-flossing-Daily.txt', 'ag-oral-flossing-Never.txt', + 'ag-oral-flossing-Occasionally.txt', + 'ag-oral-flossing-Rarely.txt', 'ag-oral-flossing-Regularly.txt', + 'ag-stool-average.txt'] + files = listdir(path) + self.assertItemsEqual(files, exp) + + with open(join(path, 'ag-oral-flossing-Rarely.txt')) as f: + obs = f.read() + exp = """k__Bacteria; p__Fusobacteria; c__Fusobacteriia; o__Fusobacteriales; f__Fusobacteriaceae; g__Fusobacterium\t0.0110430107527 +k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Planococcaceae; g__\t0.00196774193548 +k__Bacteria; p__Bacteroidetes; c__Flavobacteriia; o__Flavobacteriales; f__[Weeksellaceae]; g__Chryseobacterium\t0.0125913978495 +k__Bacteria; p__Bacteroidetes; c__Sphingobacteriia; o__Sphingobacteriales; f__Sphingobacteriaceae; g__Sphingobacterium\t0.00608602150538 +k__Bacteria; p__Firmicutes; c__Erysipelotrichi; o__Erysipelotrichales; f__Erysipelotrichaceae; g__Bulleidia\t0.00269892473118 +k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Neisseriales; f__Neisseriaceae; g__Neisseria\t0.074935483871 +k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Oribacterium\t0.00749462365591 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__\t0.017311827957 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Moraxellaceae; g__Enhydrobacter\t0.000150537634409 +k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__\t0.00667741935484 +k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Caulobacterales; f__Caulobacteraceae; g__Brevundimonas\t0.00996774193548 +k__Bacteria; p__Bacteroidetes; c__Flavobacteriia; o__Flavobacteriales; f__[Weeksellaceae]; g__\t0.00139784946237 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Klebsiella\t0.00174193548387 +k__Bacteria; p__Firmicutes; c__Bacilli; o__Bacillales; f__Staphylococcaceae; g__Staphylococcus\t0.00756989247312 +k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Prevotellaceae; g__Prevotella\t0.0509569892473 +k__Bacteria; p__Fusobacteria; c__Fusobacteriia; o__Fusobacteriales; f__Leptotrichiaceae; g__Leptotrichia\t0.0143333333333 +k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Streptococcaceae; g__Streptococcus\t0.138817204301 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Pseudomonadaceae; g__\t0.0051935483871 +k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__[Paraprevotellaceae]; g__[Prevotella]\t0.00132258064516 +k__Bacteria; p__Proteobacteria; c__Epsilonproteobacteria; o__Campylobacterales; f__Campylobacteraceae; g__Campylobacter\t0.00133333333333 +k__Bacteria; p__Firmicutes; c__Bacilli; o__Gemellales; f__Gemellaceae; g__\t0.0097311827957 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__\t0.0135268817204 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pasteurellales; f__Pasteurellaceae; g__Haemophilus\t0.0668279569892 +k__Bacteria; p__Proteobacteria; c__Alphaproteobacteria; o__Rhizobiales; f__Brucellaceae; g__Ochrobactrum\t0.00575268817204 +k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Veillonellaceae; g__Veillonella\t0.0709247311828 +k__Bacteria; p__Bacteroidetes; c__Bacteroidia; o__Bacteroidales; f__Porphyromonadaceae; g__Porphyromonas\t0.0165698924731 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pasteurellales; f__Pasteurellaceae; g__Aggregatibacter\t0.00410752688172 +k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Neisseriales; f__Neisseriaceae; g__\t0.00394623655914 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Moraxellaceae; g__Acinetobacter\t0.0247634408602 +k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Aerococcaceae; g__Alloiococcus\t0.000225806451613 +k__Bacteria; p__Firmicutes; c__Clostridia; o__Clostridiales; f__Lachnospiraceae; g__Moryella\t0.000612903225806 +k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Micrococcaceae; g__Rothia\t0.114602150538 +k__Bacteria; p__Actinobacteria; c__Coriobacteriia; o__Coriobacteriales; f__Coriobacteriaceae; g__Atopobium\t0.00761290322581 +k__Bacteria; p__Proteobacteria; c__Betaproteobacteria; o__Burkholderiales; f__Comamonadaceae; g__Alicycliphilus\t0.00530107526882 +k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Actinomycetaceae; g__Actinomyces\t0.0204408602151 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Xanthomonadales; f__Xanthomonadaceae; g__Stenotrophomonas\t0.0443655913978 +k__Bacteria; p__Actinobacteria; c__Actinobacteria; o__Actinomycetales; f__Corynebacteriaceae; g__Corynebacterium\t0.000440860215054 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Enterobacteriales; f__Enterobacteriaceae; g__Morganella\t0.000161290322581 +k__Bacteria; p__Firmicutes; c__Bacilli; o__Lactobacillales; f__Carnobacteriaceae; g__Granulicatella\t0.025247311828 +k__Bacteria; p__SR1; c__; o__; f__; g__\t0.00222580645161 +k__Bacteria; p__Proteobacteria; c__Gammaproteobacteria; o__Pseudomonadales; f__Pseudomonadaceae; g__Pseudomonas\t0.0665268817204 +""" + self.assertEqual(obs, exp) + + + +if __name__ == '__main__': + main() diff --git a/tests/test_util.py b/tests/test_util.py index c44e7e5..52d95d7 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -7,7 +7,8 @@ from StringIO import StringIO from unittest import TestCase, main -from numpy import array, nan +from numpy import array, nan, arange +from numpy.testing import assert_almost_equal from biom import Table from pandas.util.testing import assert_frame_equal @@ -15,7 +16,7 @@ slice_mapping_file, parse_mapping_file, verify_subset, concatenate_files, trim_fasta, count_samples, count_seqs, count_unique_participants, clean_and_reformat_mapping, - add_alpha_diversity, get_single_id_lists + add_alpha_diversity, get_single_id_lists, collapse_taxonomy, collapse_full ) __author__ = "Daniel McDonald" @@ -313,6 +314,37 @@ def test_get_single_id_list(self): 'unrare': ['A', 'B', 'C', 'D', 'E']} self.assertEqual(test, known) + def test_collapse_taxonomy(self): + obs = collapse_taxonomy(table) + exp = Table(array([[100.0, 105.0, 110.0, 115.0], + [44.0, 46.0, 48.0, 50.0], + [36.0, 39.0, 42.0, 45.0]]), + ['Bacteria; Firmicutes', 'Bacteria; Bacteroidetes', + 'Bacteria; Proteobacteria'], sample_ids, + sample_metadata=sample_metadata, observation_metadata=[ + {'collapsed_ids': ['O0', 'O1', 'O7', 'O8', 'O9']}, + {'collapsed_ids': ['O5', 'O6']}, + {'collapsed_ids': ['O2', 'O3', 'O4']}]) + self.assertEqual(obs, exp) + + def test_collapse_full(self): + obs = collapse_full(table) + exp = Table(array([[0.00769230769231], [0.0282051282051], + [0.0487179487179], [0.0692307692308], + [0.0897435897436], [0.110256410256], + [0.130769230769], [0.151282051282], + [0.171794871795], [0.192307692308]]), + observ_ids, ['average'], + observation_metadata=observ_metadata) + for r in range(10): + assert_almost_equal(obs[r, 0], exp[r, 0]) + self.assertEqual(obs.ids(), exp.ids()) + self.assertItemsEqual(obs.ids('observation'), exp.ids('observation')) + + obs_meta = [] + for _, _, m in obs.iter(axis='observation'): + obs_meta.append(m) + self.assertItemsEqual(obs_meta, observ_metadata) test_mapping = """#SampleIDs\tfoo\tbar a\t1\t123123 @@ -342,6 +374,24 @@ def test_get_single_id_list(self): E GAZ:stuff 56.0 UBERON:SKIN 37 """) +data = arange(40).reshape(10, 4) +sample_ids = ['S%d' % i for i in range(4)] +observ_ids = ['O%d' % i for i in range(10)] +sample_metadata = [{'environment': 'A'}, {'environment': 'B'}, + {'environment': 'A'}, {'environment': 'B'}] +observ_metadata = [{'taxonomy': ['Bacteria', 'Firmicutes']}, + {'taxonomy': ['Bacteria', 'Firmicutes']}, + {'taxonomy': ['Bacteria', 'Proteobacteria']}, + {'taxonomy': ['Bacteria', 'Proteobacteria']}, + {'taxonomy': ['Bacteria', 'Proteobacteria']}, + {'taxonomy': ['Bacteria', 'Bacteroidetes']}, + {'taxonomy': ['Bacteria', 'Bacteroidetes']}, + {'taxonomy': ['Bacteria', 'Firmicutes']}, + {'taxonomy': ['Bacteria', 'Firmicutes']}, + {'taxonomy': ['Bacteria', 'Firmicutes']}] +table = Table(data, observ_ids, sample_ids, observ_metadata, + sample_metadata, table_id='Example Table') + if __name__ == '__main__': main()