Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate taxa by summary data #208

Merged
merged 11 commits into from
May 10, 2016
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ before_install:
install:
- conda create --yes -n env_name python=$PYTHON_VERSION --file conda_requirements.txt
- source activate env_name
- conda install -y "setuptools<20.7.0"
- pip install -r pip_requirements.txt
- pip install -e . --no-deps

Expand Down
36 changes: 36 additions & 0 deletions americangut/per_category.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from os.path import join
from copy import copy

from biom import load_table

import americangut.notebook_environment as agenv
from .util import collapse_full, collapse_taxonomy, get_existing_path


def cat_taxa_summaries():
    """Creates taxa summary files for each available summary category per site

    For every collapsed (no-trim, 1k) table, writes one tab-separated file
    per non-'Unknown' sample column into the populated-templates result-taxa
    directory. Each line of a file is ``observation_id<TAB>value``; zero
    values are omitted.

    Raises
    ------
    IOError
        Via get_existing_path, if an expected input/output path is missing.
    """
    paths = copy(agenv.paths['collapsed']['notrim']['1k'])
    # BUGFIX: was `agenv.path[...]`; the module attribute is `paths`
    out_dir = get_existing_path(
        agenv.paths['populated-templates']['result-taxa'])

    # ag-biom spans all body sites; only per-site tables get summarized
    del paths['ag-biom']
    for name, path in paths.items():
        # consistent naming as stool for all participant items
        name = name.replace('-biom', '').replace('fecal', 'stool')
        table = load_table(get_existing_path(path))
        if len(name.split('-')) == 2:
            # Have entire cohort of samples for site, so need the medians
            table = collapse_full(table)
        table = collapse_taxonomy(table)
        ids = table.ids(axis='observation')
        for col in table.ids():
            if col == 'Unknown':
                continue
            # Drop the parenthetical (e.g. "(1-2 times/week)") purely to
            # keep the file names manageable
            cleaned_col = col.split('(')[0].strip().replace(' ', '_')
            filename = '-'.join([name, cleaned_col]) + '.txt'

            with open(join(out_dir, filename), 'w') as f:
                for otu, val in zip(ids, table.data(col)):
                    if val == 0.0:
                        continue
                    f.write('%s\t%s\n' % (otu, str(val)))
56 changes: 56 additions & 0 deletions americangut/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from itertools import izip
from StringIO import StringIO
from collections import defaultdict
from biom import Table

from lxml import etree
from skbio.parse.sequences import parse_fastq, parse_fasta
Expand Down Expand Up @@ -635,3 +636,58 @@ def get_single_id_lists(map_, depths):
single_ids[depths[idx - 1]].extend(single_ids[depths[idx]])

return single_ids


def collapse_taxonomy(_bt, level=5):
    """Collapses OTUs by taxonomy

    Parameters
    ----------
    _bt : biom table
        Table to collapse
    level : int, optional
        Level to collapse to. 0=kingdom, 1=phylum,...,5=genus, 6=species
        Default 5

    Returns
    -------
    biom table
        Collapsed biom table

    Citations
    ---------
    [1] http://biom-format.org/documentation/table_objects.html
    """
    # Observations sharing the same joined taxonomy prefix are merged;
    # counts are summed (norm=False).
    return _bt.collapse(
        lambda id_, md: '; '.join(md['taxonomy'][:level + 1]),
        axis='observation', norm=False)


def collapse_full(_bt):
    """Collapses full biom table to median of each OTU

    Parameters
    ----------
    _bt : biom table
        Table to collapse

    Returns
    -------
    biom table
        Collapsed biom table, one sample containing the median of each
        OTU, normalized.
    """
    medians = np.empty([_bt.length('observation'), 1])
    obs_ids = []
    obs_md = []
    for row, (vals, obs_id, md) in enumerate(_bt.iter(axis='observation')):
        medians[row][0] = np.median(vals)
        obs_ids.append(obs_id)
        obs_md.append(md)
    # NOTE(review): the single sample is labelled 'average' although it
    # holds per-OTU medians; kept as-is since downstream code and tests
    # expect that id.
    collapsed = Table(medians, obs_ids, ['average'],
                      observation_metadata=obs_md)
    collapsed.norm(inplace=True)
    return collapsed
7 changes: 7 additions & 0 deletions ipynb/primary-processing/10-participant_pdfs.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
>>> import americangut.util as agu
>>> import americangut.results_utils as agru
>>> import americangut.per_sample as agps
>>> import americangut.per_category as agpc
>>> import americangut.parallel as agpar
...
>>> chp_path = agenv.activate('10-populated-templates')
Expand Down Expand Up @@ -48,6 +49,12 @@
>>> process_pdf = partial(agps.sample_type_processor, [create_pdf, aggregate], opts)
```

We also need to write out the taxa summary files for each of the categories in the collapsed data. These will live in the same folder as the participant's taxonomy files.

```python
>>> agpc.cat_taxa_summaries()
```

```python
>>> partitions = [(process_pdf, ids)]
>>> with open(successful_pdfs, 'w') as successful_pdfs_fp, open(unsuccessful_pdfs, 'w') as unsuccessful_pdfs_fp:
Expand Down
54 changes: 52 additions & 2 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,16 @@
from StringIO import StringIO
from unittest import TestCase, main

from numpy import array, nan
from numpy import array, nan, arange
from numpy.testing import assert_almost_equal
from biom import Table
from pandas.util.testing import assert_frame_equal

from americangut.util import (
slice_mapping_file, parse_mapping_file,
verify_subset, concatenate_files, trim_fasta, count_samples,
count_seqs, count_unique_participants, clean_and_reformat_mapping,
add_alpha_diversity, get_single_id_lists
add_alpha_diversity, get_single_id_lists, collapse_taxonomy, collapse_full
)

__author__ = "Daniel McDonald"
Expand Down Expand Up @@ -313,6 +314,37 @@ def test_get_single_id_list(self):
'unrare': ['A', 'B', 'C', 'D', 'E']}
self.assertEqual(test, known)

def test_collapse_taxonomy(self):
    # Collapse the shared 10x4 fixture table at the default level; the
    # fixture's observations carry 2-element taxonomy metadata, so the
    # collapse groups them into three phylum-level rows.
    obs = collapse_taxonomy(table)
    # Expected values are the per-phylum column sums of the fixture data;
    # 'collapsed_ids' records which original observations were merged.
    exp = Table(array([[100.0, 105.0, 110.0, 115.0],
                       [44.0, 46.0, 48.0, 50.0],
                       [36.0, 39.0, 42.0, 45.0]]),
                ['Bacteria; Firmicutes', 'Bacteria; Bacteroidetes',
                 'Bacteria; Proteobacteria'], sample_ids,
                sample_metadata=sample_metadata, observation_metadata=[
                    {'collapsed_ids': ['O0', 'O1', 'O7', 'O8', 'O9']},
                    {'collapsed_ids': ['O5', 'O6']},
                    {'collapsed_ids': ['O2', 'O3', 'O4']}])
    self.assertEqual(obs, exp)

def test_collapse_full(self):
    # Collapse the shared 10x4 fixture to a single 'average' sample of
    # normalized per-observation medians.
    obs = collapse_full(table)
    exp = Table(array([[0.00769230769231], [0.0282051282051],
                       [0.0487179487179], [0.0692307692308],
                       [0.0897435897436], [0.110256410256],
                       [0.130769230769], [0.151282051282],
                       [0.171794871795], [0.192307692308]]),
                observ_ids, ['average'],
                observation_metadata=observ_metadata)
    # Compare values element-wise with a float tolerance rather than
    # Table equality, since the expected medians are rounded literals.
    for r in range(10):
        assert_almost_equal(obs[r, 0], exp[r, 0])
    self.assertEqual(obs.ids(), exp.ids())
    self.assertItemsEqual(obs.ids('observation'), exp.ids('observation'))

    # Observation metadata should be carried through the collapse intact
    # (order-insensitive comparison).
    obs_meta = []
    for _, _, m in obs.iter(axis='observation'):
        obs_meta.append(m)
    self.assertItemsEqual(obs_meta, observ_metadata)

test_mapping = """#SampleIDs\tfoo\tbar
a\t1\t123123
Expand Down Expand Up @@ -342,6 +374,24 @@ def test_get_single_id_list(self):
E GAZ:stuff 56.0 UBERON:SKIN 37
""")

# Shared fixtures for test_collapse_taxonomy / test_collapse_full:
# a 10-observation x 4-sample table with values 0..39 laid out row-major.
data = arange(40).reshape(10, 4)
sample_ids = ['S%d' % i for i in range(4)]
observ_ids = ['O%d' % i for i in range(10)]
sample_metadata = [{'environment': 'A'}, {'environment': 'B'},
                   {'environment': 'A'}, {'environment': 'B'}]
# Two-level taxonomy (kingdom, phylum) per observation; collapse_taxonomy
# groups rows by the joined prefix of these lists.
observ_metadata = [{'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Proteobacteria']},
                   {'taxonomy': ['Bacteria', 'Proteobacteria']},
                   {'taxonomy': ['Bacteria', 'Proteobacteria']},
                   {'taxonomy': ['Bacteria', 'Bacteroidetes']},
                   {'taxonomy': ['Bacteria', 'Bacteroidetes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']},
                   {'taxonomy': ['Bacteria', 'Firmicutes']}]
table = Table(data, observ_ids, sample_ids, observ_metadata,
              sample_metadata, table_id='Example Table')


if __name__ == '__main__':
main()