Skip to content

Commit

Permalink
Add code to process the Python PEPs dataset (#94)
Browse files Browse the repository at this point in the history
* Add code to process the Python PEPs dataset.

Python PEPs are long documents of intermixed prose and code that
describe possible changes to the python language. All PEPs are in the
public domain once published.

* Filter out the few Open Publication Licenses PEPs

* Update documentation on PEP licenses and use pypandoc
  • Loading branch information
blester125 authored Oct 18, 2024
1 parent 98b768c commit c286f13
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

Repo to hold code and track issues for the collection of permissively licensed data

## Installation

The majority of packages required for data creation can be installed with `pip install -r requirements.txt`. You also need to run `pip install -e .` to get access to the `licensed_pile` shared utility library.

If you are on a system that doesn't support automatic installation of pandoc with `pypandoc_binary`, change it to `pypandoc` in the `requirements.txt` and install pandoc manually.

## Tips

Expand Down
1 change: 1 addition & 0 deletions pep/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/*
33 changes: 33 additions & 0 deletions pep/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Python PEP

The majority of PEPs should be placed in the public domain, as seen here https://peps.python.org/pep-0001/#pep-review-resolution:

> Copyright/license – Each new PEP must be placed under a dual license of public domain and CC0-1.0-Universal (see this PEP for an example).
However, some are published under the Open Publication License, as seen here https://peps.python.org/pep-0009/

> Update your References and Copyright section. Usually you'll place your PEP into the public domain, in which case just leave the "Copyright" section alone. Alternatively, you can use the Open Publication License[3], but public domain is still strongly preferred.

## Collecting the Data

1. Clone the peps repository https://github.com/python/peps
2. Run `python to_dolma.py --peps /path/to/cloned/repo`
3. Install pandoc
4. Run `python preprocess.py`

### Alternative Approaches

An alternative to Pandoc would be to use `docutils` and `rst2txt` as they have some python specific features (like converting the ``:pep:`00NN``` to `PEP NN`). However, the formatting in the `rst2txt` writer was so slow that things never finished.

``` python
def clean_rst(text):
from docutils.core import publish_string
import rst2txt
try:
return publish_string(source=text, writer=rst2txt.Writer()).decode("utf-8")
except:
logger = logs.get_logger()
logger.error("Failed to parse rst", exc_info=True)
return text
```
112 changes: 112 additions & 0 deletions pep/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Preprocess Python PEPs."""

import argparse
import multiprocessing as mp
import re
from datetime import datetime

import pypandoc

from licensed_pile import logs, utils
from licensed_pile.write import ShardParallelProcessor

# Command-line interface. Input/output paths are dolma-format dataset
# versions: each is a directory whose `documents/` subdir holds the
# sharded .jsonl.gz files.
parser = argparse.ArgumentParser(description="Preprocess raw peps in the dolma format.")
parser.add_argument(
    "--input",
    default="data/peps-dolma/raw",
    help="The input version, this directory should be where the `documents` dir lives.",
)
parser.add_argument(
    "--output",
    default="data/peps-dolma/v0",
    help="The output version, this directory should be where the `documents` dir will live.",
)
parser.add_argument(
    "--debug",
    action="store_true",
    help="Should we log when documents are not changed by preprocessing.",
)
parser.add_argument(
    "--processes",
    type=int,
    default=mp.cpu_count(),
    help="Number of processors for multicore.",
)
parser.add_argument(
    "--meta",
    # When omitted, a temporary directory is used (see utils.maybe_temp_dir
    # in main).
    help="Location to store Dolma Metadata information.",
)


# Configure the shared licensed_pile logging once at import time so both the
# main process and worker processes log consistently.
logs.configure_logging()


def extract_created(text):
    """Return the value of the first ``Created:`` header line, or None."""
    match = re.search(r"^Created: (?P<date>.*)$", text, re.MULTILINE)
    if match is None:
        return None
    return match.group("date").strip()


def parse_date(date):
    """Parse a PEP ``Created:`` date string such as ``"31-Jul-2015"``."""
    # PEP headers use day-abbreviated-month-year, e.g. 13-Jul-2000.
    fmt = "%d-%b-%Y"
    return datetime.strptime(date, fmt)


def extract_authors(text):
    """Return the raw ``Author:`` header value (may span multiple lines).

    Captures everything from ``Author: `` up to the start of the next
    ``Key:``-style header line. Returns None when no header is found.
    """
    pattern = re.compile(r"^Author: (?P<authors>.*?)^.*?:", re.MULTILINE | re.DOTALL)
    match = pattern.search(text)
    return match.group("authors") if match else None


def parse_authors(authors):
    """Split a raw author header into a sorted list of names.

    Email addresses in angle brackets are removed and blank entries
    (e.g. from trailing commas) are dropped.
    """
    without_emails = re.sub(r"<.*?>", "", authors)
    stripped = (name.strip() for name in without_emails.split(","))
    return sorted(name for name in stripped if name)


def process_pep(text):
    """Replace Sphinx ``:pep:`NNNN``` roles with plain ``PEP N`` text.

    Leading zeros in the PEP number are dropped (``:pep:`0008``` becomes
    ``PEP 8``) to match how PEPs are referenced in prose; the previous
    regex kept the zero padding verbatim.
    """
    return re.sub(r":pep:`(\d{1,4})`", lambda m: f"PEP {int(m.group(1))}", text)


def clean_rst(text):
    """Convert reStructuredText to plain text via pandoc, trimming whitespace."""
    plain = pypandoc.convert_text(text, "plain", format="rst")
    return plain.strip()


class PEPParallel(ShardParallelProcessor):
    """Shard-parallel processor that cleans raw PEP documents.

    For each dolma example it extracts the creation date and author list
    from the PEP headers, records the pandoc version used, and converts
    the rST body to plain text.
    """

    @classmethod
    def process_example(cls, example, **kwargs):
        logger = cls.get_logger()

        # NOTE(review): calling the logger appears to bind `id` into the log
        # context for this block — confirm against the licensed_pile logs API.
        with logger(id=example["id"]):
            logger.debug("Processing example")
            pep = example["text"]

            # NOTE(review): parse_date will raise if no "Created:" header is
            # present (extract_created returns None) — assumes every PEP has
            # one; confirm.
            created = extract_created(pep)
            created = parse_date(created)
            example["created"] = created.isoformat()

            authors = extract_authors(pep)
            example["metadata"]["authors"] = parse_authors(authors)

            # Update this if the implementation of clean_rst changes.
            example["metadata"]["pandoc_version"] = pypandoc.get_pandoc_version()

            # Rewrite :pep:`N` roles first so pandoc sees plain "PEP N" text.
            pep = process_pep(pep)
            example["text"] = clean_rst(pep)
        return example


def main(args):
    """Run the shard-parallel PEP preprocessing pipeline.

    Metadata goes to --meta when given, otherwise to a temp dir that is
    cleaned up on exit.
    """
    with utils.maybe_temp_dir(path=args.meta) as metadata_dir:
        pipeline = PEPParallel(
            source_prefix=utils.dolma_input(args.input, "*.jsonl.gz"),
            destination_prefix=utils.dolma_output(args.output),
            metadata_prefix=metadata_dir,
            num_processes=args.processes,
        )
        pipeline(debug=args.debug)


if __name__ == "__main__":
    # Use "spawn" so worker processes start fresh instead of forking the
    # parent's state (safer with the multiprocessing pool used here).
    mp.set_start_method("spawn")
    args = parser.parse_args()
    main(args)
94 changes: 94 additions & 0 deletions pep/to_dolma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Convert python pep's into the dolma format."""

import argparse
import glob
import os
import re
from datetime import datetime

import docutils.core

from licensed_pile import logs
from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma

# Value recorded in each dolma example's "source" field.
SOURCE_NAME = "python-peps"

# Command-line interface for converting a cloned python/peps checkout into
# sharded dolma-format .jsonl.gz files.
parser = argparse.ArgumentParser(description="Convert peps to dolma.")
parser.add_argument("--peps", required=True, help="The path to the cloned pep repo.")
parser.add_argument(
    "--output_dir",
    default="data/peps-dolma/raw/documents/",
    help="Where the dolma formatted data goes.",
)
parser.add_argument(
    "--filename", default="peps.jsonl.gz", help="The base filename for shards."
)
parser.add_argument(
    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)


def extract_pep_number(file_name):
    """Return the zero-padded PEP number from a ``pep-NNNN.rst`` filename.

    Returns None when the filename does not match. The ``.`` before
    ``rst`` is escaped — the previous pattern's bare ``.`` matched any
    character, accepting names like ``pep-0008xrst``.
    """
    if m := re.match(r"^pep-(?P<num>\d{4})\.rst$", file_name):
        return m.group("num")
    return None


def check_for_open_pub_license(text):
    """Return True if the PEP's Copyright section uses the Open Publication License.

    Parses the rST source into a doctree, finds section(s) named
    "copyright", and looks for OPL markers in their paragraph text.

    Bug fix: paragraph text is lowercased before comparison, but the SPDX
    URL needle previously contained uppercase ("OPUBL-1.0"), so that check
    could never match. All needles are now lowercase.
    """
    doc = docutils.core.publish_doctree(
        text,
        settings_overrides={
            # Disable include directives and silence/ignore parse problems;
            # we only need the section structure, not a clean render.
            "file_insertion_enabled": False,
            "report_level": 5,
            "halt_level": 5,
        },
    )
    # Markers must be lowercase: they are tested against lowercased text.
    markers = (
        "open publication license",
        "https://spdx.org/licenses/opubl-1.0.html",
        "http://www.opencontent.org/openpub/",
    )
    for cr in doc.findall(
        lambda s: s.tagname == "section"
        and "copyright" in [name.lower() for name in s["names"]]
    ):
        for paragraph in cr.findall(lambda p: p.tagname == "paragraph"):
            copyright_text = paragraph.rawsource.lower()
            if any(marker in copyright_text for marker in markers):
                return True
    return False


def format_dolma(path, source_name: str = SOURCE_NAME):
    """Read one PEP rST file and return it as a dolma-format dict.

    Returns None (so callers can filter it out) when the PEP is under the
    Open Publication License rather than public domain.
    """
    with open(path) as f:
        text = f.read()
    if check_for_open_pub_license(text):
        logger = logs.get_logger()
        logger.warning(f"Skipping {path} as it is Open Publication License.")
        return None
    pep_number = extract_pep_number(os.path.basename(path))
    return {
        "id": pep_number,
        "text": text,
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # datetime.now(timezone.utc) is preferred, but switching changes the
        # isoformat output (adds an offset) — confirm before changing.
        "added": datetime.utcnow().isoformat(),
        # "created" and "authors" are filled in later by preprocess.py from
        # the PEP headers.
        "created": None,
        "metadata": {
            "license": str(PermissiveLicenses.PD),
            "url": f"https://peps.python.org/pep-{pep_number}/",
            "authors": None,
            "pep_number": pep_number,
        },
    }


def main(args):
    """Convert every ``pep-*.rst`` in the cloned repo into dolma shards."""
    rst_paths = glob.iglob(os.path.join(args.peps, "peps", "pep-*.rst"))
    examples = (format_dolma(path) for path in rst_paths)
    # Drop PEPs skipped for licensing reasons (format_dolma returns None).
    kept = (example for example in examples if example is not None)
    to_dolma(kept, args.output_dir, args.filename, args.shard_size)


if __name__ == "__main__":
    args = parser.parse_args()
    # Configure shared licensed_pile logging before any conversion work.
    logs.configure_logging()
    main(args)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pandas
patool
pre-commit
pylatexenc
pypandoc_binary
pyunpack
rdflib
requests>=2.13
Expand Down

0 comments on commit c286f13

Please sign in to comment.