From 7e242c3668ceb19ea839482a9c86a6c877173a5e Mon Sep 17 00:00:00 2001
From: mikerobeson
Date: Mon, 6 Jan 2025 16:10:32 -0600
Subject: [PATCH] initial draft of PR2 fetching code

---
 rescript/citations.bib | 15 +++++++
 rescript/get_pr2.py    | 88 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 103 insertions(+)
 create mode 100644 rescript/get_pr2.py

diff --git a/rescript/citations.bib b/rescript/citations.bib
index c5a507c..950d0bb 100644
--- a/rescript/citations.bib
+++ b/rescript/citations.bib
@@ -189,3 +189,18 @@ @article{olson2023introducing
 year={2023},
 publisher={Oxford University Press}
 }
+
+@article{Guillou2013pr2,
+ author = {Guillou, Laure and Bachar, Dipankar and Audic, Stéphane and Bass, David and Berney, Cédric and Bittner, Lucie and Boutte, Christophe and Burgaud, Gaétan and de Vargas, Colomban and Decelle, Johan and del Campo, Javier and Dolan, John R. and Dunthorn, Micah and Edvardsen, Bente and Holzmann, Maria and Kooistra, Wiebe H.C.F. and Lara, Enrique and Le Bescot, Noan and Logares, Ramiro and Mahé, Frédéric and Massana, Ramon and Montresor, Marina and Morard, Raphael and Not, Fabrice and Pawlowski, Jan and Probert, Ian and Sauvadet, Anne-Laure and Siano, Raffaele and Stoeck, Thorsten and Vaulot, Daniel and Zimmermann, Pascal and Christen, Richard},
+ title = {The Protist Ribosomal Reference database (PR2): a catalog of unicellular eukaryote Small Sub-Unit rRNA sequences with curated taxonomy},
+ journal = {Nucleic Acids Research},
+ volume = {41},
+ number = {D1},
+ pages = {D597-D604},
+ year = {2013},
+ month = {11},
+ issn = {0305-1048},
+ doi = {10.1093/nar/gks1160},
+ url = {https://doi.org/10.1093/nar/gks1160},
+ pmid = {23193267},
+}
diff --git a/rescript/get_pr2.py b/rescript/get_pr2.py
new file mode 100644
index 0000000..72defe5
--- /dev/null
+++ b/rescript/get_pr2.py
@@ -0,0 +1,88 @@
+# ----------------------------------------------------------------------------
+# Copyright (c) 2019-2025, QIIME 2 development team.
+#
+# Distributed under the terms of the Modified BSD License.
+#
+# The full license is in the file LICENSE, distributed with this software.
+# ----------------------------------------------------------------------------
+
+import os
+import tempfile
+import shutil
+import gzip
+import warnings
+from urllib.request import urlretrieve
+from urllib.error import HTTPError
+from pathlib import Path
+from pandas import DataFrame
+from q2_types.feature_data import (TaxonomyFormat, DNAFASTAFormat,
+                                    DNAIterator)
+
+# Note: No need for '5.0.1' or '4.14.1'. Documentation says to refer to
+# the '5.0.0' and '4.14.0' files respectively.
+# Also, might need to include extra code for 4.13 and earlier, as
+# the 16S and 18S files are separated. Note the slight differences
+# in naming convention:
+#     pr2_version_5.0.0_SSU_mothur.fasta.gz
+# vs
+#     pr2_version_4.13.0_16S_mothur.fasta.gz
+#     pr2_version_4.13.0_18S_mothur.fasta.gz
+
+
+def get_pr2_data(
+        version: str = '5.0.0',
+        ) -> (DNAIterator, DataFrame):
+
+    urls = _assemble_pr2_urls(version=version)
+    seqs, tax = _retrieve_data_from_pr2(urls)
+
+    print('\n Saving files...\n')
+    return seqs, tax
+
+
+def _assemble_pr2_urls(version='5.0.0'):
+    urls_to_retrieve = []
+    base_url = ('https://github.com/pr2database/pr2database/releases/download/'
+                'v{ver}/pr2_version_{ver}_SSU_mothur.{ext}.gz')
+
+    for data_type in ['fasta', 'tax']:
+        urls_to_retrieve.append(base_url.format(**{'ver': version,
+                                                   'ext': data_type}))
+    return urls_to_retrieve
+
+
+def _retrieve_data_from_pr2(urls_to_retrieve):
+    # Perform a check that `urls_to_retrieve` only contains two files:
+    # a 'fasta' and a 'taxonomy' file.
+
+    # seqs = DNAFASTAFormat()
+    # tax = HeaderlessTSVTaxonomyFormat()
+
+    print('\nDownloading and processing raw files ... \n')
+
+    with tempfile.TemporaryDirectory() as tmpdirname:
+        for url in urls_to_retrieve:
+            compressed_fn = Path(url).name
+            uncompressed_fn = Path(url).stem
+            in_path = os.path.join(tmpdirname, compressed_fn)
+            out_path = os.path.join(tmpdirname, uncompressed_fn)
+
+            try:
+                print('Retrieving data from {0}'.format(url))
+                urlretrieve(url, in_path)
+            except HTTPError:
+                msg = ("Unable to retrieve the following file from PR2:\n"
+                       + url)
+                warnings.warn(msg, UserWarning)
+
+            print(' Unzipping {0}...\n'.format(in_path))
+            with gzip.open(in_path, 'rt') as gz_in:
+                with open(out_path, 'w') as gz_out:
+                    shutil.copyfileobj(gz_in, gz_out)
+            if out_path.endswith('fasta'):
+                seqs = DNAFASTAFormat(out_path,
+                                      mode="r").view(DNAIterator)
+            elif out_path.endswith('tax'):
+                tax = TaxonomyFormat(out_path,
+                                     mode="r").view(DataFrame)
+        return seqs, tax
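
Note on the older-release naming flagged in the comment block of get_pr2.py: releases 4.13.0 and
earlier publish separate 16S and 18S files instead of a single SSU file. Below is a rough sketch,
not part of the patch, of how the URL assembly could branch on the version. The helper name
(_assemble_pr2_urls_sketch), the pre-4.14 cutoff, the gene list, and the assumption that matching
.tax.gz files exist for the 16S/18S releases are all illustrative, taken only from the file names
quoted in the comment. _retrieve_data_from_pr2 would then receive four files for older releases
and would need extra handling (e.g. concatenating the 16S and 18S data), which this sketch does
not attempt.

    def _assemble_pr2_urls_sketch(version='5.0.0'):
        # Hypothetical URL pattern with a {gene} slot; for >= 4.14 the gene
        # component is 'SSU', matching the pattern used in the patch above.
        base_url = ('https://github.com/pr2database/pr2database/releases/'
                    'download/v{ver}/pr2_version_{ver}_{gene}_mothur.{ext}.gz')

        # Compare major/minor numerically rather than as strings so that,
        # e.g., '4.9.0' sorts below '4.14.0'.
        major, minor = (int(part) for part in version.split('.')[:2])
        genes = ['SSU'] if (major, minor) >= (4, 14) else ['16S', '18S']

        urls_to_retrieve = []
        for gene in genes:
            for ext in ['fasta', 'tax']:
                urls_to_retrieve.append(
                    base_url.format(ver=version, gene=gene, ext=ext))
        return urls_to_retrieve

For version='5.0.0' this yields the same two SSU URLs as the patch; for version='4.13.0' it yields
four URLs (16S and 18S, fasta and tax each), assuming those files exist on the release page.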