Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate taxa by summary data #208

Merged
merged 11 commits into from
May 10, 2016
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ before_install:
install:
- conda create --yes -n env_name python=$PYTHON_VERSION --file conda_requirements.txt
- source activate env_name
- conda install -y "setuptools<20.7.0"
- pip install -r pip_requirements.txt
- pip install -e . --no-deps

Expand Down
36 changes: 36 additions & 0 deletions americangut/per_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from os.path import join
from copy import copy

from biom import load_table

import americangut.notebook_environment as agenv
from .util import collapse_full, collapse_taxonomy, get_existing_path


def cat_taxa_summaries():
    """Creates taxa summary files for each available summary category per site

    For every collapsed (no-trim, 1k) table, writes one tab-separated file
    per non-'Unknown' sample column into the populated-templates result-taxa
    directory. Each line of a file is ``observation_id<TAB>value``; zero
    values are omitted.

    Raises
    ------
    IOError
        Via get_existing_path, if an expected input/output path is missing.
    """
    paths = copy(agenv.paths['collapsed']['notrim']['1k'])
    # BUGFIX: was `agenv.path[...]`; the module attribute is `paths`
    out_dir = get_existing_path(
        agenv.paths['populated-templates']['result-taxa'])

    # ag-biom spans all body sites; only per-site tables get summarized
    del paths['ag-biom']
    for name, path in paths.items():
        # consistent naming as stool for all participant items
        name = name.replace('-biom', '').replace('fecal', 'stool')
        table = load_table(get_existing_path(path))
        if len(name.split('-')) == 2:
            # Have entire cohort of samples for site, so need the medians
            table = collapse_full(table)
        table = collapse_taxonomy(table)
        ids = table.ids(axis='observation')
        for col in table.ids():
            if col == 'Unknown':
                continue
            # Drop the parenthetical (e.g. "(1-2 times/week)") purely to
            # keep the file names manageable
            cleaned_col = col.split('(')[0].strip().replace(' ', '_')
            filename = '-'.join([name, cleaned_col]) + '.txt'

            with open(join(out_dir, filename), 'w') as f:
                for otu, val in zip(ids, table.data(col)):
                    if val == 0.0:
                        continue
                    f.write('%s\t%s\n' % (otu, str(val)))
56 changes: 56 additions & 0 deletions americangut/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from itertools import izip
from StringIO import StringIO
from collections import defaultdict
from biom import Table

from lxml import etree
from skbio.parse.sequences import parse_fastq, parse_fasta
Expand Down Expand Up @@ -635,3 +636,58 @@ def get_single_id_lists(map_, depths):
single_ids[depths[idx - 1]].extend(single_ids[depths[idx]])

return single_ids


def collapse_taxonomy(_bt, level=5):
    """Collapses OTUs by taxonomy

    Parameters
    ----------
    _bt : biom table
        Table to collapse
    level : int, optional
        Level to collapse to. 0=kingdom, 1=phylum,...,5=genus, 6=species
        Default 5

    Returns
    -------
    biom table
        Collapsed biom table

    Citations
    ---------
    [1] http://biom-format.org/documentation/table_objects.html
    """
    # Observations sharing the same joined taxonomy prefix are merged;
    # counts are summed (norm=False).
    return _bt.collapse(
        lambda id_, md: '; '.join(md['taxonomy'][:level + 1]),
        axis='observation', norm=False)


def collapse_full(_bt):
    """Collapses full biom table to median of each OTU

    Parameters
    ----------
    _bt : biom table
        Table to collapse

    Returns
    -------
    biom table
        Collapsed biom table, one sample containing the median of each
        OTU, normalized.
    """
    medians = np.empty([_bt.length('observation'), 1])
    obs_ids = []
    obs_md = []
    for row, (vals, obs_id, md) in enumerate(_bt.iter(axis='observation')):
        medians[row][0] = np.median(vals)
        obs_ids.append(obs_id)
        obs_md.append(md)
    # NOTE(review): the single sample is labelled 'average' although it
    # holds per-OTU medians; kept as-is since downstream code and tests
    # expect that id.
    collapsed = Table(medians, obs_ids, ['average'],
                      observation_metadata=obs_md)
    collapsed.norm(inplace=True)
    return collapsed
7 changes: 7 additions & 0 deletions ipynb/primary-processing/10-participant_pdfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
>>> import americangut.util as agu
>>> import americangut.results_utils as agru
>>> import americangut.per_sample as agps
>>> import americangut.per_category as agpc
>>> import americangut.parallel as agpar
...
>>> chp_path = agenv.activate('10-populated-templates')
Expand Down Expand Up @@ -48,6 +49,12 @@
>>> process_pdf = partial(agps.sample_type_processor, [create_pdf, aggregate], opts)
```

We also need to write out the taxa summary files for each of the categories in the collapsed data. These will live in the same folder as the participant's taxonomy files.

```python
>>> agpc.cat_taxa_summaries()
```

```python
>>> partitions = [(process_pdf, ids)]
>>> with open(successful_pdfs, 'w') as successful_pdfs_fp, open(unsuccessful_pdfs, 'w') as unsuccessful_pdfs_fp:
Expand Down
54 changes: 52 additions & 2 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@
from StringIO import StringIO
from unittest import TestCase, main

from numpy import array, nan
from numpy import array, nan, arange
from numpy.testing import assert_almost_equal
from biom import Table
from pandas.util.testing import assert_frame_equal

from americangut.util import (
slice_mapping_file, parse_mapping_file,
verify_subset, concatenate_files, trim_fasta, count_samples,
count_seqs, count_unique_participants, clean_and_reformat_mapping,
add_alpha_diversity, get_single_id_lists
add_alpha_diversity, get_single_id_lists, collapse_taxonomy, collapse_full
)

__author__ = "Daniel McDonald"
Expand Down Expand Up @@ -313,6 +314,37 @@ def test_get_single_id_list(self):
'unrare': ['A', 'B', 'C', 'D', 'E']}
self.assertEqual(test, known)

def test_collapse_taxonomy(self):
    # Collapse the shared 10x4 fixture table at the default level; the
    # fixture's observations carry 2-element taxonomy metadata, so the
    # collapse groups them into three phylum-level rows.
    obs = collapse_taxonomy(table)
    # Expected values are the per-phylum column sums of the fixture data;
    # 'collapsed_ids' records which original observations were merged.
    exp = Table(array([[100.0, 105.0, 110.0, 115.0],
                       [44.0, 46.0, 48.0, 50.0],
                       [36.0, 39.0, 42.0, 45.0]]),
                ['Bacteria; Firmicutes', 'Bacteria; Bacteroidetes',
                 'Bacteria; Proteobacteria'], sample_ids,
                sample_metadata=sample_metadata, observation_metadata=[
                    {'collapsed_ids': ['O0', 'O1', 'O7', 'O8', 'O9']},
                    {'collapsed_ids': ['O5', 'O6']},
                    {'collapsed_ids': ['O2', 'O3', 'O4']}])
    self.assertEqual(obs, exp)

def test_collapse_full(self):
    # Collapse the shared 10x4 fixture to a single 'average' sample of
    # normalized per-observation medians.
    obs = collapse_full(table)
    exp = Table(array([[0.00769230769231], [0.0282051282051],
                       [0.0487179487179], [0.0692307692308],
                       [0.0897435897436], [0.110256410256],
                       [0.130769230769], [0.151282051282],
                       [0.171794871795], [0.192307692308]]),
                observ_ids, ['average'],
                observation_metadata=observ_metadata)
    # Compare values element-wise with a float tolerance rather than
    # Table equality, since the expected medians are rounded literals.
    for r in range(10):
        assert_almost_equal(obs[r, 0], exp[r, 0])
    self.assertEqual(obs.ids(), exp.ids())
    self.assertItemsEqual(obs.ids('observation'), exp.ids('observation'))

    # Observation metadata should be carried through the collapse intact
    # (order-insensitive comparison).
    obs_meta = []
    for _, _, m in obs.iter(axis='observation'):
        obs_meta.append(m)
    self.assertItemsEqual(obs_meta, observ_metadata)

test_mapping = """#SampleIDs\tfoo\tbar
a\t1\t123123
Expand Down Expand Up @@ -342,6 +374,24 @@ def test_get_single_id_list(self):
E GAZ:stuff 56.0 UBERON:SKIN 37
""")

# Shared fixtures for test_collapse_taxonomy / test_collapse_full:
# a 10-observation x 4-sample table with values 0..39 laid out row-major.
data = arange(40).reshape(10, 4)
sample_ids = ['S%d' % i for i in range(4)]
observ_ids = ['O%d' % i for i in range(10)]
sample_metadata = [{'environment': 'A'}, {'environment': 'B'},
                   {'environment': 'A'}, {'environment': 'B'}]
# Two-level taxonomy (kingdom, phylum) per observation; collapse_taxonomy
# groups rows by the joined prefix of these lists.
observ_metadata = [{'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Proteobacteria']},
                   {'taxonomy': ['Bacteria', 'Proteobacteria']},
                   {'taxonomy': ['Bacteria', 'Proteobacteria']},
                   {'taxonomy': ['Bacteria', 'Bacteroidetes']},
                   {'taxonomy': ['Bacteria', 'Bacteroidetes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']}]
table = Table(data, observ_ids, sample_ids, observ_metadata,
              sample_metadata, table_id='Example Table')


if __name__ == '__main__':
main()