From c286f130eba87b5803bdeef8615ce503ce0676e5 Mon Sep 17 00:00:00 2001
From: Brian Lester <blester125@users.noreply.github.com>
Date: Fri, 18 Oct 2024 11:26:54 -0400
Subject: [PATCH] Add code to process the Python PEPs dataset (#94)

* Add code to process the Python PEPs dataset.

Python PEPs are long documents of intermixed prose and code that
describe possible changes to the python language. All PEPs are in the
public domain once published.

* Filter out the few Open Publication Licenses PEPs

* Update documentation on PEP licenses and use pypandoc
---
 README.md         |   5 +++
 pep/.gitignore    |   1 +
 pep/README.md     |  33 ++++++++++++++
 pep/preprocess.py | 112 ++++++++++++++++++++++++++++++++++++++++++++++
 pep/to_dolma.py   |  94 ++++++++++++++++++++++++++++++++++++++
 requirements.txt  |   1 +
 6 files changed, 246 insertions(+)
 create mode 100644 pep/.gitignore
 create mode 100644 pep/README.md
 create mode 100644 pep/preprocess.py
 create mode 100644 pep/to_dolma.py

diff --git a/README.md b/README.md
index 481419b..081163b 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,11 @@
 
 Repo to hold code and track issues for the collection of permissively licensed data
 
+## Installation
+
+The majority of packages required for data creation can be installed with `pip install -r requirements.txt`. You also need to run `pip install -e .` to get access to the `licensed_pile` shared utility library.
+
+If you are on a system that doesn't support automatic installation of pandoc with `pypandoc_binary`, change it to `pypandoc` in `requirements.txt` and install pandoc manually.
 
 ## Tips
 
diff --git a/pep/.gitignore b/pep/.gitignore
new file mode 100644
index 0000000..60baa9c
--- /dev/null
+++ b/pep/.gitignore
@@ -0,0 +1 @@
+data/*
diff --git a/pep/README.md b/pep/README.md
new file mode 100644
index 0000000..da05d90
--- /dev/null
+++ b/pep/README.md
@@ -0,0 +1,33 @@
+# Python PEP
+
+The majority of PEPs should be placed in the public domain, as seen here https://peps.python.org/pep-0001/#pep-review-resolution:
+
+> Copyright/license – Each new PEP must be placed under a dual license of public domain and CC0-1.0-Universal (see this PEP for an example).
+
+However some are published under the Open Publication License, as seen here https://peps.python.org/pep-0009/
+
+> Update your References and Copyright section.  Usually you'll place your PEP into the public domain, in which case just leave the "Copyright" section alone.  Alternatively, you can use the Open Publication License[3], but public domain is still strongly preferred.
+
+
+## Collecting the Data
+
+1. Clone the peps repository https://github.com/python/peps
+2. Run `python to_dolma.py --peps /path/to/cloned/repo`
+3. Install pandoc (not needed if `pypandoc_binary` from `requirements.txt` already provides it)
+4. Run `python preprocess.py`
+
+### Alternative Approaches
+
+An alternative to Pandoc would be to use `docutils` and `rst2txt` as they have some python specific features (like converting the ``:pep:`00NN``` to `PEP NN`). However, the formatting in the `rst2txt` writer was so slow that things never finished.
+
+``` python
+def clean_rst(text):
+    from docutils.core import publish_string
+    import rst2txt
+    try:
+        return publish_string(source=text, writer=rst2txt.Writer()).decode("utf-8")
+    except Exception:  # Not bare: let KeyboardInterrupt/SystemExit propagate.
+        logger = logs.get_logger()
+        logger.error("Failed to parse rst", exc_info=True)
+        return text
+```
diff --git a/pep/preprocess.py b/pep/preprocess.py
new file mode 100644
index 0000000..ab545d7
--- /dev/null
+++ b/pep/preprocess.py
@@ -0,0 +1,112 @@
+"""Preprocess Python PEPs."""
+
+import argparse
+import multiprocessing as mp
+import re
+from datetime import datetime
+
+import pypandoc
+
+from licensed_pile import logs, utils
+from licensed_pile.write import ShardParallelProcessor
+
+parser = argparse.ArgumentParser(description="Preprocess raw peps in the dolma format.")
+parser.add_argument(
+    "--input",
+    default="data/peps-dolma/raw",
+    help="The input version, this directory should be where the `documents` dir lives.",
+)
+parser.add_argument(
+    "--output",
+    default="data/peps-dolma/v0",
+    help="The output version, this directory should be where the `documents` dir will live.",
+)
+parser.add_argument(
+    "--debug",
+    action="store_true",
+    help="Should we log when documents are not changed by preprocessing.",
+)
+parser.add_argument(
+    "--processes",
+    type=int,
+    default=mp.cpu_count(),
+    help="Number of processors for multicore.",
+)
+parser.add_argument(
+    "--meta",  # No default: main() passes None to utils.maybe_temp_dir if omitted.
+    help="Location to store Dolma Metadata information.",
+)
+
+
+logs.configure_logging()  # NOTE(review): module level, presumably so workers log too.
+
+
+def extract_created(text):
+    match = re.search(r"^Created: (?P<date>.*)$", text, re.MULTILINE)
+    return match.group("date").strip() if match else None  # None: no such header.
+
+
+def parse_date(date):
+    return datetime.strptime(date, "%d-%b-%Y")  # e.g. "13-Jun-2000"; ValueError otherwise.
+
+
+def extract_authors(text):
+    # Authors may wrap onto indented lines, so stop at the next unindented header.
+    pattern = r"^Author: (?P<authors>.*?)^[\w-]+:"
+    if m := re.search(pattern, text, re.MULTILINE | re.DOTALL):
+        return m.group("authors")
+
+
+def parse_authors(authors):
+    """Strip `<email>` parts and return the sorted, non-empty author names."""
+    names = (name.strip() for name in re.sub(r"<.*?>", "", authors).split(","))
+    return sorted(filter(None, names))
+
+
+def process_pep(text):
+    return re.sub(r":pep:`(\d{1,4})`", r"PEP \1", text)  # e.g. :pep:`8` -> PEP 8
+
+
+def clean_rst(text):
+    return pypandoc.convert_text(text, "plain", format="rst").strip()  # rst -> plain
+
+
+class PEPParallel(ShardParallelProcessor):
+    @classmethod
+    def process_example(cls, example, **kwargs):
+        """Fill `created` and `authors` from the PEP headers, then clean the text."""
+        logger = cls.get_logger()
+
+        with logger(id=example["id"]):
+            logger.debug("Processing example")
+            raw_pep = example["text"]
+
+            # Both header fields are read from the raw rst, before conversion.
+            created_on = parse_date(extract_created(raw_pep))
+            example["created"] = created_on.isoformat()
+            author_names = parse_authors(extract_authors(raw_pep))
+            example["metadata"]["authors"] = author_names
+
+            # Update this if the implementation of clean_rst changes.
+            example["metadata"]["pandoc_version"] = pypandoc.get_pandoc_version()
+
+            # Expand :pep: roles first, then let pandoc render the plain text.
+            example["text"] = clean_rst(process_pep(raw_pep))
+            return example
+
+
+def main(args):
+    with utils.maybe_temp_dir(path=args.meta) as metadata_dir:
+        run_preprocessing = PEPParallel(
+            source_prefix=utils.dolma_input(args.input, "*.jsonl.gz"),
+            destination_prefix=utils.dolma_output(args.output),
+            metadata_prefix=metadata_dir,
+            num_processes=args.processes,
+        )
+        run_preprocessing(debug=args.debug)  # --debug is forwarded to the processor.
+
+
+if __name__ == "__main__":
+    mp.set_start_method("spawn")  # Start workers as fresh interpreters, not forks.
+    args = parser.parse_args()
+    main(args)
diff --git a/pep/to_dolma.py b/pep/to_dolma.py
new file mode 100644
index 0000000..6fb60ef
--- /dev/null
+++ b/pep/to_dolma.py
@@ -0,0 +1,94 @@
+"""Convert python pep's into the dolma format."""
+
+import argparse
+import glob
+import os
+import re
+from datetime import datetime
+
+import docutils.core
+
+from licensed_pile import logs
+from licensed_pile.licenses import PermissiveLicenses
+from licensed_pile.write import to_dolma
+
+SOURCE_NAME = "python-peps"  # Default `source` value of every record.
+
+parser = argparse.ArgumentParser(description="Convert peps to dolma.")
+parser.add_argument("--peps", required=True, help="The path to the cloned pep repo.")
+parser.add_argument(
+    "--output_dir",
+    default="data/peps-dolma/raw/documents/",  # Under preprocess.py's default --input.
+    help="Where the dolma formatted data goes.",
+)
+parser.add_argument(
+    "--filename", default="peps.jsonl.gz", help="The base filename for shards."
+)
+parser.add_argument(
+    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
+)
+
+
+def extract_pep_number(file_name):
+    if m := re.fullmatch(r"pep-(?P<num>\d{4})\.rst", file_name):  # "." is literal.
+        return m.group("num")
+
+
+def check_for_open_pub_license(text):
+    """Return True if the Copyright section mentions the Open Publication License."""
+    doc = docutils.core.publish_doctree(
+        text,
+        settings_overrides={
+            "file_insertion_enabled": False,
+            "report_level": 5,  # 5 == "none": report no system messages.
+            "halt_level": 5,  # 5 == "none": never abort on a parse problem.
+        },
+    )
+    markers = (  # All lowercase: they are matched against lowercased text.
+        "open publication license",
+        "https://spdx.org/licenses/opubl-1.0.html",
+        "http://www.opencontent.org/openpub/",
+    )
+    for cr in doc.findall(
+        lambda s: s.tagname == "section" and "copyright" in map(str.lower, s["names"])
+    ):
+        for paragraph in cr.findall(lambda p: p.tagname == "paragraph"):
+            if any(m in paragraph.rawsource.lower() for m in markers):
+                return True
+    return False
+
+
+def format_dolma(path, source_name: str = SOURCE_NAME):
+    """Build the dolma record for the PEP at `path`; None if it is OPL licensed."""
+    with open(path, encoding="utf-8") as f:  # Don't rely on the locale encoding.
+        text = f.read()
+    if check_for_open_pub_license(text):
+        logs.get_logger().warning(f"Skipping {path} as it is Open Publication License.")
+        return None
+    pep_number = extract_pep_number(os.path.basename(path))
+    return {
+        "id": pep_number,
+        "text": text,
+        "source": source_name,
+        "added": datetime.utcnow().isoformat(),
+        "created": None,  # Filled in later by preprocess.py.
+        "metadata": {
+            "license": str(PermissiveLicenses.PD),
+            "url": f"https://peps.python.org/pep-{pep_number}/",
+            "authors": None,  # Filled in later by preprocess.py.
+            "pep_number": pep_number,
+        },
+    }
+
+
+def main(args):
+    """Write a dolma record for every non-skipped `pep-*.rst` in the clone."""
+    pattern = os.path.join(args.peps, "peps", "pep-*.rst")
+    records = (r for r in map(format_dolma, glob.iglob(pattern)) if r is not None)
+    to_dolma(records, args.output_dir, args.filename, args.shard_size)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    logs.configure_logging()  # Set up logging before any PEP is read.
+    main(args)
diff --git a/requirements.txt b/requirements.txt
index 1c64450..4f79581 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,6 +11,7 @@ pandas
 patool
 pre-commit
 pylatexenc
+pypandoc_binary
 pyunpack
 rdflib
 requests>=2.13