Skip to content

Commit

Permalink
initial draft of PR2 fetching code
Browse files Browse the repository at this point in the history
  • Loading branch information
mikerobeson committed Jan 6, 2025
1 parent 86836b5 commit 7e242c3
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 0 deletions.
15 changes: 15 additions & 0 deletions rescript/citations.bib
Original file line number Diff line number Diff line change
Expand Up @@ -189,3 +189,18 @@ @article{olson2023introducing
year={2023},
publisher={Oxford University Press}
}

@article{Guillou2013pr2,
author = {Guillou, Laure and Bachar, Dipankar and Audic, Stéphane and Bass, David and Berney, Cédric and Bittner, Lucie and Boutte, Christophe and Burgaud, Gaétan and de Vargas, Colomban and Decelle, Johan and del Campo, Javier and Dolan, John R. and Dunthorn, Micah and Edvardsen, Bente and Holzmann, Maria and Kooistra, Wiebe H.C.F. and Lara, Enrique and Le Bescot, Noan and Logares, Ramiro and Mahé, Frédéric and Massana, Ramon and Montresor, Marina and Morard, Raphael and Not, Fabrice and Pawlowski, Jan and Probert, Ian and Sauvadet, Anne-Laure and Siano, Raffaele and Stoeck, Thorsten and Vaulot, Daniel and Zimmermann, Pascal and Christen, Richard},
title = {The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote Small Sub-Unit rRNA sequences with curated taxonomy},
journal = {Nucleic Acids Research},
volume = {41},
number = {D1},
pages = {D597-D604},
year = {2013},
month = {11},
issn = {0305-1048},
doi = {10.1093/nar/gks1160},
url = {https://doi.org/10.1093/nar/gks1160},
pmid = {23193267},
}
88 changes: 88 additions & 0 deletions rescript/get_pr2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# ----------------------------------------------------------------------------
# Copyright (c) 2019-2025, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
import tempfile
import shutil
import gzip
import warnings
from urllib.request import urlretrieve
from urllib.error import HTTPError
from pathlib import Path
from pandas import DataFrame
from q2_types.feature_data import (TaxonomyFormat, DNAFASTAFormat,
DNAIterator)

# Note: No need for '5.0.1' '4.14.1'. Documentaiton says to refer to
# the '5.0.0' and '4.14.0' files respectively.
# Also, might need to include extra code for 4.13 and earlier as
# the 16S and 18S files are separated. Note the slight differences
# in naming convention:
# pr2_version_5.0.0_SSU_mothur.fasta.gz
# VS
# pr2_version_4.13.0_16S_mothur.fasta.gz
# pr2_version_4.13.0_18S_mothur.fasta.gz


def get_pr2_data(
version: str = '5.0.0',
) -> (DNAIterator, DataFrame):

urls = _assemble_pr2_urls(version=version)
seqs, tax = _retrieve_data_from_pr2(urls)

print('\n Saving files...\n')
return seqs, tax


def _assemble_pr2_urls(version='5.0.0'):
urls_to_retrieve = []
base_url = ('https://github.com/pr2database/pr2database/releases/download/'
'v{ver}/pr2_version_{ver}_SSU_mothur.{ext}.gz')

for data_type in ['fasta', 'tax']:
urls_to_retrieve.append(base_url.format(**{'ver': version,
'ext': data_type}))
return urls_to_retrieve


def _retrieve_data_from_pr2(urls_to_retrieve):
# Perform check that the `urls_to_retriev` should only
# contain 2 files, a 'fasta' and 'taxonomy' file.

# seqs = DNAFASTAFormat()
# tax = HeaderlessTSVTaxonomyFormat()

print('\nDownloading and processing raw files ... \n')

with tempfile.TemporaryDirectory() as tmpdirname:
for url in urls_to_retrieve:
compressed_fn = Path(url).name
uncompressed_fn = Path(url).stem
in_path = os.path.join(tmpdirname, compressed_fn)
out_path = os.path.join(tmpdirname, uncompressed_fn)

try:
print('Retrieving data from {0}'.format(url))
urlretrieve(url, in_path)
except HTTPError:
msg = ("Unable to retrieve the followng file from PR2:\n"
+ url)
warnings.warn(msg, UserWarning)

print(' Unzipping {0}...\n'.format(in_path))
with gzip.open(in_path, 'rt') as gz_in:
with open(out_path, 'w') as gz_out:
shutil.copyfileobj(gz_in, gz_out)
if out_path.endswith('fasta'):
seqs = DNAFASTAFormat(out_path,
mode="r").view(DNAIterator)
elif out_path.endswith('tax'):
tax = TaxonomyFormat(out_path,
mode="r").view(DataFrame)
return seqs, tax

0 comments on commit 7e242c3

Please sign in to comment.