-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add code to process the Python PEPs dataset (#94)
* Add code to process the Python PEPs dataset. Python PEPs are long documents of intermixed prose and code that describe possible changes to the python language. All PEPs are in the public domain once published. * Filter out the few Open Publication Licenses PEPs * Update documentation on PEP licenses and use pypandoc
- Loading branch information
1 parent
98b768c
commit c286f13
Showing
6 changed files
with
246 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
data/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Python PEP | ||
|
||
The majority of PEPs should be placed in the public domain, as seen here https://peps.python.org/pep-0001/#pep-review-resolution: | ||
|
||
> Copyright/license – Each new PEP must be placed under a dual license of public domain and CC0-1.0-Universal (see this PEP for an example). | ||
However, some are published under the Open Publication License, as seen here https://peps.python.org/pep-0009/:
|
||
> Update your References and Copyright section. Usually you'll place your PEP into the public domain, in which case just leave the "Copyright" section alone. Alternatively, you can use the Open Publication License[3], but public domain is still strongly preferred. | ||
|
||
## Collecting the Data | ||
|
||
1. Clone the peps repository https://github.com/python/peps | ||
2. Run `python to_dolma.py --peps /path/to/cloned/repo`
3. Install pandoc
4. Run `python preprocess.py`
|
||
### Alternative Approaches | ||
|
||
An alternative to Pandoc would be to use `docutils` and `rst2txt`, as they have some Python-specific features (like converting ``:pep:`00NN` `` to `PEP NN`). However, the formatting in the `rst2txt` writer was so slow that processing never finished.
|
||
``` python | ||
def clean_rst(text):
    """Render reStructuredText to plain text with docutils and rst2txt.

    Returns the rendered text decoded as UTF-8. If rendering raises, the
    failure is logged and the original ``text`` is returned unchanged.
    """
    from docutils.core import publish_string
    import rst2txt
    try:
        return publish_string(source=text, writer=rst2txt.Writer()).decode("utf-8")
    except:
        # NOTE(review): the bare ``except`` also traps SystemExit and
        # KeyboardInterrupt -- confirm this is intended before narrowing it.
        # NOTE(review): ``logs`` is not imported in this snippet; presumably
        # it is ``licensed_pile.logs`` -- verify.
        logger = logs.get_logger()
        logger.error("Failed to parse rst", exc_info=True)
        return text
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
"""Preprocess Python PEPs.""" | ||
|
||
import argparse | ||
import multiprocessing as mp | ||
import re | ||
from datetime import datetime | ||
|
||
import pypandoc | ||
|
||
from licensed_pile import logs, utils | ||
from licensed_pile.write import ShardParallelProcessor | ||
|
||
# Command line interface for the preprocessing script.
parser = argparse.ArgumentParser(description="Preprocess raw peps in the dolma format.")
# Directory holding the raw dolma `documents` dir to read from.
parser.add_argument(
    "--input",
    default="data/peps-dolma/raw",
    help="The input version, this directory should be where the `documents` dir lives.",
)
# Directory the processed dolma `documents` dir is written under.
parser.add_argument(
    "--output",
    default="data/peps-dolma/v0",
    help="The output version, this directory should be where the `documents` dir will live.",
)
parser.add_argument(
    "--debug",
    action="store_true",
    help="Should we log when documents are not changed by preprocessing.",
)
# Defaults to one worker per CPU reported by multiprocessing.
parser.add_argument(
    "--processes",
    type=int,
    default=mp.cpu_count(),
    help="Number of processors for multicore.",
)
# No default: when omitted, args.meta is None.
parser.add_argument(
    "--meta",
    help="Location to store Dolma Metadata information.",
)


# Logging is configured at import time, not only when run as a script.
logs.configure_logging()
|
||
|
||
def extract_created(text):
    """Return the value of the PEP's ``Created:`` header, or ``None``.

    The first line of ``text`` starting with ``Created: `` is used; the rest
    of that line is returned with surrounding whitespace removed. ``None`` is
    returned when no such line exists.
    """
    match = re.search(r"^Created: (?P<date>.*)$", text, re.MULTILINE)
    if match is None:
        return None
    return match.group("date").strip()
|
||
|
||
def parse_date(date):
    """Parse a ``DD-Mon-YYYY`` date string (e.g. ``13-Jun-2000``) into a datetime.

    Raises ``ValueError`` if ``date`` does not match that format.
    """
    created_format = "%d-%b-%Y"
    return datetime.strptime(date, created_format)
|
||
|
||
def extract_authors(text):
    """Return the raw value of the PEP's ``Author:`` header, or ``None``.

    The header value may be folded over several lines; continuation lines are
    the ones that start with a space or tab. All of them are returned as one
    string (newlines and indentation preserved, no trailing newline), so the
    caller is expected to split and strip the individual names.

    Returns ``None`` when ``text`` has no ``Author:`` header.
    """
    # The first ``.*`` takes the rest of the header line. Each repetition of
    # the group then takes one indented, non-blank continuation line, and the
    # match stops at the next header because that line starts in column 0.
    # DOTALL is deliberately not used so ``.`` never crosses a line boundary.
    if m := re.search(
        r"^Author:[ \t]*(?P<authors>.*(?:\n[ \t]+\S.*)*)", text, re.MULTILINE
    ):
        return m.group("authors")
    return None
|
||
|
||
def parse_authors(authors):
    """Turn a raw ``Author:`` header value into a sorted list of names.

    Anything in angle brackets (e.g. ``<email>``) is removed, the remainder is
    split on commas, and each piece is stripped. Empty pieces are discarded.
    """
    without_brackets = re.sub(r"<.*?>", "", authors)
    names = []
    for piece in without_brackets.split(","):
        name = piece.strip()
        if name:
            names.append(name)
    names.sort()
    return names
|
||
|
||
def process_pep(text):
    """Rewrite ``:pep:`N``` roles (1-4 digit numbers) as plain ``PEP N`` text.

    The digits are copied through verbatim; other forms of the role are left
    untouched.
    """
    pep_role = re.compile(r":pep:`(\d{1,4})`")
    return pep_role.sub(r"PEP \1", text)
|
||
|
||
def clean_rst(text):
    """Convert reStructuredText to plain text with pandoc and strip the result."""
    plain = pypandoc.convert_text(text, "plain", format="rst")
    return plain.strip()
|
||
|
||
class PEPParallel(ShardParallelProcessor):
    """Shard processor that fills in PEP metadata and converts the text to plain text."""

    @classmethod
    def process_example(cls, example, **kwargs):
        """Populate ``created``, author and pandoc metadata, then clean the text.

        The ``Created`` and ``Author`` headers are read from the raw PEP text
        before it is converted, and ``example`` is updated in place and
        returned.
        """
        logger = cls.get_logger()

        with logger(id=example["id"]):
            logger.debug("Processing example")
            raw_pep = example["text"]

            created_on = parse_date(extract_created(raw_pep))
            example["created"] = created_on.isoformat()

            author_names = parse_authors(extract_authors(raw_pep))
            example["metadata"]["authors"] = author_names

            # Update this if the implementation of clean_rst changes.
            example["metadata"]["pandoc_version"] = pypandoc.get_pandoc_version()

            # Rewrite :pep: roles first, then render the rst to plain text.
            example["text"] = clean_rst(process_pep(raw_pep))
            return example
|
||
|
||
def main(args):
    """Run the PEP preprocessor over every input shard.

    ``args`` is the parsed command line namespace; ``args.meta`` is handed to
    ``utils.maybe_temp_dir`` to obtain the metadata directory.
    """
    with utils.maybe_temp_dir(path=args.meta) as meta_dir:
        shard_pattern = utils.dolma_input(args.input, "*.jsonl.gz")
        output_location = utils.dolma_output(args.output)
        processor = PEPParallel(
            source_prefix=shard_pattern,
            destination_prefix=output_location,
            metadata_prefix=meta_dir,
            num_processes=args.processes,
        )
        processor(debug=args.debug)
|
||
|
||
if __name__ == "__main__":
    # Select the "spawn" start method before main() builds the processor.
    mp.set_start_method("spawn")
    args = parser.parse_args()
    main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
"""Convert python pep's into the dolma format.""" | ||
|
||
import argparse | ||
import glob | ||
import os | ||
import re | ||
from datetime import datetime | ||
|
||
import docutils.core | ||
|
||
from licensed_pile import logs | ||
from licensed_pile.licenses import PermissiveLicenses | ||
from licensed_pile.write import to_dolma | ||
|
||
# Value written to the `source` field of every emitted record.
SOURCE_NAME = "python-peps"


# Command line interface for the conversion script.
parser = argparse.ArgumentParser(description="Convert peps to dolma.")
parser.add_argument("--peps", required=True, help="The path to the cloned pep repo.")
parser.add_argument(
    "--output_dir",
    default="data/peps-dolma/raw/documents/",
    help="Where the dolma formatted data goes.",
)
parser.add_argument(
    "--filename", default="peps.jsonl.gz", help="The base filename for shards."
)
parser.add_argument(
    "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
)
|
||
|
||
def extract_pep_number(file_name):
    """Return the four-digit PEP number from a ``pep-NNNN.rst`` file name.

    The number is returned as a string with its leading zeros intact. ``None``
    is returned when ``file_name`` does not have exactly that shape.
    """
    # The dot before ``rst`` is escaped so that it only matches a literal
    # ``.`` and names such as ``pep-0008xrst`` are rejected.
    if m := re.match(r"^pep-(?P<num>\d{4})\.rst$", file_name):
        return m.group("num")
    return None
|
||
|
||
def check_for_open_pub_license(text):
    """Return True if the PEP's Copyright section cites the Open Publication License.

    ``text`` is parsed as reStructuredText. Every paragraph inside a section
    named "copyright" is searched, case-insensitively, for the license name or
    for one of its known URLs. Returns False when no such paragraph is found.
    """
    # Paragraph text is lower-cased before the comparison, so every marker
    # must be lower-case as well or it can never match.
    markers = (
        "open publication license",
        "https://spdx.org/licenses/opubl-1.0.html",
        "http://www.opencontent.org/openpub/",
    )
    doc = docutils.core.publish_doctree(
        text,
        settings_overrides={
            # Do not let directives in the document pull in other files.
            "file_insertion_enabled": False,
            # NOTE(review): level 5 appears to be above every docutils system
            # message level (i.e. report nothing, never halt) -- confirm
            # against the docutils configuration docs.
            "report_level": 5,
            "halt_level": 5,
        },
    )
    for cr in doc.findall(
        lambda s: s.tagname == "section"
        and "copyright" in [name.lower() for name in s["names"]]
    ):
        for paragraph in cr.findall(lambda p: p.tagname == "paragraph"):
            copyright_text = paragraph.rawsource.lower()
            if any(marker in copyright_text for marker in markers):
                return True
    return False
|
||
|
||
def format_dolma(path, source_name: str = SOURCE_NAME):
    """Read one PEP file and build its dolma record.

    Args:
        path: Path to a ``pep-NNNN.rst`` file.
        source_name: Value stored in the record's ``source`` field.

    Returns:
        The record as a dict, or ``None`` when the PEP is under the Open
        Publication License and is therefore skipped. ``created`` and
        ``metadata["authors"]`` are left as ``None`` here.
    """
    # Read explicitly as UTF-8 rather than relying on the platform's locale
    # encoding, which may fail on or mis-decode non-ASCII text.
    with open(path, encoding="utf-8") as f:
        text = f.read()
    if check_for_open_pub_license(text):
        logger = logs.get_logger()
        logger.warning(f"Skipping {path} as it is Open Publication License.")
        return None
    pep_number = extract_pep_number(os.path.basename(path))
    return {
        "id": pep_number,
        "text": text,
        "source": source_name,
        "added": datetime.utcnow().isoformat(),
        "created": None,
        "metadata": {
            "license": str(PermissiveLicenses.PD),
            "url": f"https://peps.python.org/pep-{pep_number}/",
            "authors": None,
            "pep_number": pep_number,
        },
    }
|
||
|
||
def main(args):
    """Convert every ``pep-*.rst`` file under ``<args.peps>/peps`` to dolma shards.

    Files for which ``format_dolma`` returns ``None`` are dropped. The
    pipeline stays lazy, so files are read as ``to_dolma`` consumes them.
    """
    pattern = os.path.join(args.peps, "peps", "pep-*.rst")
    records = (
        record
        for record in map(format_dolma, glob.iglob(pattern))
        if record is not None
    )
    to_dolma(records, args.output_dir, args.filename, args.shard_size)
|
||
|
||
if __name__ == "__main__":
    args = parser.parse_args()
    # Logging is configured only when run as a script, after argument parsing.
    logs.configure_logging()
    main(args)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ pandas | |
patool | ||
pre-commit | ||
pylatexenc | ||
pypandoc_binary | ||
pyunpack | ||
rdflib | ||
requests>=2.13 | ||
|