diff --git a/wiki/scrape/README.md b/wiki/scrape/README.md index 99e2a22..6687eee 100644 --- a/wiki/scrape/README.md +++ b/wiki/scrape/README.md @@ -17,3 +17,8 @@ Either the integer or the name can be used as input. This generates lists of pag 4. Convert the XML export into the dolma format with `python to-dolma.py --wiki ${wiki_url} --license ${license_str}` **TODO:** Is this exported format the exact same as the published mediawiki dumps to the point we can reuse code? + +The export format is the same as the wiki dump. + +Wikiarchive scrapes come in ~3 versions: two use the same format as the dump and one has a unique format. Most of the wikis +that aren't online anymore use this old format. diff --git a/wiki/scrape/to-dolma.py b/wiki/scrape/to-dolma.py index 97dc480..d14e7c7 100644 --- a/wiki/scrape/to-dolma.py +++ b/wiki/scrape/to-dolma.py @@ -26,13 +26,28 @@ default=f"data/{SOURCE_NAME}/raw/documents/", help="Where the dolma formatted data goes.", ) +parser.add_argument( + "--source", + choices=["wikiscrape", "wikiarchive", "wikidump"], + default="wikiscrape", + help="Where does the data come from?", +) parser.add_argument( "--filename", default=None, help="The base filename for our chat data." ) parser.add_argument( "--shard_size", type=int, default=1, help="Size, in GB, for each shard." ) -parser.add_argument("--last_author", action="store_true", help="") +parser.add_argument( + "--last_author", + action="store_true", + help="Should we only include the most recent author? 
(Faster)", +) +parser.add_argument( + "--include_redirects", + action="store_true", + help="Should we skip pages that are redirects to others?", +) def main(args): @@ -57,13 +72,16 @@ def main(args): pages = map( functools.partial( format_dolma, - source_name=SOURCE_NAME, + source_name=args.source, wiki=args.wiki, license=license, all_authors=not args.last_author, + skip_redirect=not args.include_redirects, ), pages, ) + # When we filter out pages based on things like redirects, they may be None + pages = filter(lambda p: p is not None, pages) to_dolma(pages, args.output_dir, args.filename, args.shard_size) @@ -73,7 +91,10 @@ def format_dolma( wiki: str, license: PermissiveLicenses, all_authors: bool = True, + skip_redirect: bool = True, ): + if skip_redirect and [x for x in xml if x.tag.endswith("redirect")]: + return None revisions = [r for r in xml if r.tag.endswith("revision")] # TODO Handle if this fails and add logging. text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text