Skip to content

Commit

Permalink
Add code to process the Python PEPs dataset (#94)
Browse files Browse the repository at this point in the history
* Add code to process the Python PEPs dataset.

Python PEPs are long documents of intermixed prose and code that
describe possible changes to the python language. All PEPs are in the
public domain once published.

* Filter out the few Open Publication Licenses PEPs

* Update documentation on PEP licenses and use pypandoc
  • Loading branch information
blester125 authored Oct 18, 2024
1 parent 98b768c commit c286f13
Show file tree
Hide file tree
Showing 6 changed files with 246 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@

Repo to hold code and track issues for the collection of permissively licensed data

## Installation

The majority of packages required for data creation can be installed with `pip install -r requirements.txt`. You also need to run `pip install -e .` to get access to the `licensed_pile` shared utility library.

If you are on a system that doesn't support automatic installation of pandoc with `pypandoc_binary`, change it to `pypandoc` in the `requirements.txt` and install pandoc manually.

## Tips

Expand Down
1 change: 1 addition & 0 deletions pep/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/*
33 changes: 33 additions & 0 deletions pep/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Python PEP

The majority of PEPs should be placed in the public domain, as seen here https://peps.python.org/pep-0001/#pep-review-resolution:

> Copyright/license – Each new PEP must be placed under a dual license of public domain and CC0-1.0-Universal (see this PEP for an example).
However, some are published under the Open Publication License, as seen here https://peps.python.org/pep-0009/

> Update your References and Copyright section. Usually you'll place your PEP into the public domain, in which case just leave the "Copyright" section alone. Alternatively, you can use the Open Publication License[3], but public domain is still strongly preferred.

## Collecting the Data

1. Clone the peps repository https://github.com/python/peps
2. Run `python to_dolma.py --peps /path/to/cloned/repo`
3. Install pandoc
4. Run `python preprocess.py`

### Alternative Approaches

An alternative to Pandoc would be to use `docutils` and `rst2txt` as they have some python specific features (like converting the ``:pep:`00NN``` to `PEP NN`). However, the formatting in the `rst2txt` writer was so slow that things never finished.

``` python
def clean_rst(text):
from docutils.core import publish_string
import rst2txt
try:
return publish_string(source=text, writer=rst2txt.Writer()).decode("utf-8")
except:
logger = logs.get_logger()
logger.error("Failed to parse rst", exc_info=True)
return text
```
112 changes: 112 additions & 0 deletions pep/preprocess.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
"""Preprocess Python PEPs."""

import argparse
import multiprocessing as mp
import re
from datetime import datetime

import pypandoc

from licensed_pile import logs, utils
from licensed_pile.write import ShardParallelProcessor

# Command-line interface. Input/output paths are dolma-format dataset
# versions: each is a directory whose `documents/` subdir holds the
# sharded .jsonl.gz files.
parser = argparse.ArgumentParser(description="Preprocess raw peps in the dolma format.")
parser.add_argument(
    "--input",
    default="data/peps-dolma/raw",
    help="The input version, this directory should be where the `documents` dir lives.",
)
parser.add_argument(
    "--output",
    default="data/peps-dolma/v0",
    help="The output version, this directory should be where the `documents` dir will live.",
)
parser.add_argument(
    "--debug",
    action="store_true",
    help="Should we log when documents are not changed by preprocessing.",
)
parser.add_argument(
    "--processes",
    type=int,
    default=mp.cpu_count(),
    help="Number of processors for multicore.",
)
parser.add_argument(
    "--meta",
    # When omitted, a temporary directory is used (see utils.maybe_temp_dir
    # in main).
    help="Location to store Dolma Metadata information.",
)


# Configure the shared licensed_pile logging once at import time so both the
# main process and worker processes log consistently.
logs.configure_logging()


def extract_created(text):
    """Return the value of the first ``Created:`` header line, or None."""
    match = re.search(r"^Created: (?P<date>.*)$", text, re.MULTILINE)
    if match is None:
        return None
    return match.group("date").strip()


def parse_date(date):
    """Parse a PEP ``Created:`` date string such as ``"31-Jul-2015"``."""
    # PEP headers use day-abbreviated-month-year, e.g. 13-Jul-2000.
    fmt = "%d-%b-%Y"
    return datetime.strptime(date, fmt)


def extract_authors(text):
    """Return the raw ``Author:`` header value (may span multiple lines).

    Captures everything from ``Author: `` up to the start of the next
    ``Key:``-style header line. Returns None when no header is found.
    """
    pattern = re.compile(r"^Author: (?P<authors>.*?)^.*?:", re.MULTILINE | re.DOTALL)
    match = pattern.search(text)
    return match.group("authors") if match else None


def parse_authors(authors):
    """Split a raw author header into a sorted list of names.

    Email addresses in angle brackets are removed and blank entries
    (e.g. from trailing commas) are dropped.
    """
    without_emails = re.sub(r"<.*?>", "", authors)
    stripped = (name.strip() for name in without_emails.split(","))
    return sorted(name for name in stripped if name)


def process_pep(text):
    """Replace Sphinx ``:pep:`NNNN``` roles with plain ``PEP N`` text.

    Leading zeros in the PEP number are dropped (``:pep:`0008``` becomes
    ``PEP 8``) to match how PEPs are referenced in prose; the previous
    regex kept the zero padding verbatim.
    """
    return re.sub(r":pep:`(\d{1,4})`", lambda m: f"PEP {int(m.group(1))}", text)


def clean_rst(text):
    """Convert reStructuredText to plain text via pandoc, trimming whitespace."""
    plain = pypandoc.convert_text(text, "plain", format="rst")
    return plain.strip()


class PEPParallel(ShardParallelProcessor):
    """Shard-parallel processor that cleans raw PEP documents.

    For each dolma example it extracts the creation date and author list
    from the PEP headers, records the pandoc version used, and converts
    the rST body to plain text.
    """

    @classmethod
    def process_example(cls, example, **kwargs):
        logger = cls.get_logger()

        # NOTE(review): calling the logger appears to bind `id` into the log
        # context for this block — confirm against the licensed_pile logs API.
        with logger(id=example["id"]):
            logger.debug("Processing example")
            pep = example["text"]

            # NOTE(review): parse_date will raise if no "Created:" header is
            # present (extract_created returns None) — assumes every PEP has
            # one; confirm.
            created = extract_created(pep)
            created = parse_date(created)
            example["created"] = created.isoformat()

            authors = extract_authors(pep)
            example["metadata"]["authors"] = parse_authors(authors)

            # Update this if the implementation of clean_rst changes.
            example["metadata"]["pandoc_version"] = pypandoc.get_pandoc_version()

            # Rewrite :pep:`N` roles first so pandoc sees plain "PEP N" text.
            pep = process_pep(pep)
            example["text"] = clean_rst(pep)
        return example


def main(args):
    """Run the shard-parallel PEP preprocessing pipeline.

    Metadata goes to --meta when given, otherwise to a temp dir that is
    cleaned up on exit.
    """
    with utils.maybe_temp_dir(path=args.meta) as metadata_dir:
        pipeline = PEPParallel(
            source_prefix=utils.dolma_input(args.input, "*.jsonl.gz"),
            destination_prefix=utils.dolma_output(args.output),
            metadata_prefix=metadata_dir,
            num_processes=args.processes,
        )
        pipeline(debug=args.debug)


if __name__ == "__main__":
    # Use "spawn" so worker processes start fresh instead of forking the
    # parent's state (safer with the multiprocessing pool used here).
    mp.set_start_method("spawn")
    args = parser.parse_args()
    main(args)
94 changes: 94 additions & 0 deletions pep/to_dolma.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
"""Convert python pep's into the dolma format."""

import argparse
import glob
import os
import re
from datetime import datetime

import docutils.core

from licensed_pile import logs
from licensed_pile.licenses import PermissiveLicenses
from licensed_pile.write import to_dolma

# Value recorded in each dolma example's "source" field.
SOURCE_NAME = "python-peps"

# Command-line interface for converting a cloned python/peps checkout into
# sharded dolma-format .jsonl.gz files.
parser = argparse.ArgumentParser(description="Convert peps to dolma.")
parser.add_argument("--peps", required=True, help="The path to the cloned pep repo.")
parser.add_argument(
    "--output_dir",
    default="data/peps-dolma/raw/documents/",
    help="Where the dolma formatted data goes.",
)
parser.add_argument(
    "--filename", default="peps.jsonl.gz", help="The base filename for shards."
)
parser.add_argument(
    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)


def extract_pep_number(file_name):
    """Return the zero-padded PEP number from a ``pep-NNNN.rst`` filename.

    Returns None when the filename does not match. The ``.`` before
    ``rst`` is escaped — the previous pattern's bare ``.`` matched any
    character, accepting names like ``pep-0008xrst``.
    """
    if m := re.match(r"^pep-(?P<num>\d{4})\.rst$", file_name):
        return m.group("num")
    return None


def check_for_open_pub_license(text):
    """Return True if the PEP's Copyright section uses the Open Publication License.

    Parses the rST source into a doctree, finds section(s) named
    "copyright", and looks for OPL markers in their paragraph text.

    Bug fix: paragraph text is lowercased before comparison, but the SPDX
    URL needle previously contained uppercase ("OPUBL-1.0"), so that check
    could never match. All needles are now lowercase.
    """
    doc = docutils.core.publish_doctree(
        text,
        settings_overrides={
            # Disable include directives and silence/ignore parse problems;
            # we only need the section structure, not a clean render.
            "file_insertion_enabled": False,
            "report_level": 5,
            "halt_level": 5,
        },
    )
    # Markers must be lowercase: they are tested against lowercased text.
    markers = (
        "open publication license",
        "https://spdx.org/licenses/opubl-1.0.html",
        "http://www.opencontent.org/openpub/",
    )
    for cr in doc.findall(
        lambda s: s.tagname == "section"
        and "copyright" in [name.lower() for name in s["names"]]
    ):
        for paragraph in cr.findall(lambda p: p.tagname == "paragraph"):
            copyright_text = paragraph.rawsource.lower()
            if any(marker in copyright_text for marker in markers):
                return True
    return False


def format_dolma(path, source_name: str = SOURCE_NAME):
    """Read one PEP rST file and return it as a dolma-format dict.

    Returns None (so callers can filter it out) when the PEP is under the
    Open Publication License rather than public domain.
    """
    with open(path) as f:
        text = f.read()
    if check_for_open_pub_license(text):
        logger = logs.get_logger()
        logger.warning(f"Skipping {path} as it is Open Publication License.")
        return None
    pep_number = extract_pep_number(os.path.basename(path))
    return {
        "id": pep_number,
        "text": text,
        # NOTE(review): datetime.utcnow() is deprecated since Python 3.12;
        # datetime.now(timezone.utc) is preferred, but switching changes the
        # isoformat output (adds an offset) — confirm before changing.
        "added": datetime.utcnow().isoformat(),
        # "created" and "authors" are filled in later by preprocess.py from
        # the PEP headers.
        "created": None,
        "metadata": {
            "license": str(PermissiveLicenses.PD),
            "url": f"https://peps.python.org/pep-{pep_number}/",
            "authors": None,
            "pep_number": pep_number,
        },
    }


def main(args):
    """Convert every ``pep-*.rst`` in the cloned repo into dolma shards."""
    rst_paths = glob.iglob(os.path.join(args.peps, "peps", "pep-*.rst"))
    examples = (format_dolma(path) for path in rst_paths)
    # Drop PEPs skipped for licensing reasons (format_dolma returns None).
    kept = (example for example in examples if example is not None)
    to_dolma(kept, args.output_dir, args.filename, args.shard_size)


if __name__ == "__main__":
    args = parser.parse_args()
    # Configure shared licensed_pile logging before any conversion work.
    logs.configure_logging()
    main(args)
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ pandas
patool
pre-commit
pylatexenc
pypandoc_binary
pyunpack
rdflib
requests>=2.13
Expand Down

0 comments on commit c286f13

Please sign in to comment.