diff --git a/wiki/scrape/README.md b/wiki/scrape/README.md
index 99e2a22..6687eee 100644
--- a/wiki/scrape/README.md
+++ b/wiki/scrape/README.md
@@ -17,3 +17,8 @@ Either the integer or the name can be used as input. This generates lists of pag
 4. Convert the XML export into the dolma format with `python to-dolma.py --wiki ${wiki_url} --license ${license_str}`
 
 **TODO:** Is this exported format the exact same as the published mediawiki dumps to the point we can reuse code?
+
+The export format is the same as the wiki dump format.
+
+Wikiarchive scrapes come in ~3 versions: two use the same format as the dump and one has a unique format. Most of the wikis
+that aren't online anymore use this old format.
diff --git a/wiki/scrape/to-dolma.py b/wiki/scrape/to-dolma.py
index 97dc480..d14e7c7 100644
--- a/wiki/scrape/to-dolma.py
+++ b/wiki/scrape/to-dolma.py
@@ -26,13 +26,28 @@
     default=f"data/{SOURCE_NAME}/raw/documents/",
     help="Where the dolma formatted data goes.",
 )
+parser.add_argument(
+    "--source",
+    choices=["wikiscrape", "wikiarchive", "wikidump"],
+    default="wikiscrape",
+    help="Where does the data come from?",
+)
 parser.add_argument(
     "--filename", default=None, help="The base filename for our chat data."
 )
 parser.add_argument(
     "--shard_size", type=int, default=1, help="Size, in GB, for each shard."
 )
-parser.add_argument("--last_author", action="store_true", help="")
+parser.add_argument(
+    "--last_author",
+    action="store_true",
+    help="Should we only include the most recent author? (Faster)",
+)
+parser.add_argument(
+    "--include_redirects",
+    action="store_true",
+    help="Should we include pages that are redirects to others? (Skipped by default)",
+)
 
 
 def main(args):
@@ -57,13 +72,16 @@ def main(args):
     pages = map(
         functools.partial(
             format_dolma,
-            source_name=SOURCE_NAME,
+            source_name=args.source,
             wiki=args.wiki,
             license=license,
             all_authors=not args.last_author,
+            skip_redirect=not args.include_redirects,
         ),
         pages,
     )
+    # When we filter out pages based on things like redirects, they may be None
+    pages = filter(lambda p: p is not None, pages)
     to_dolma(pages, args.output_dir, args.filename, args.shard_size)
 
 
@@ -73,7 +91,10 @@ def format_dolma(
     wiki: str,
     license: PermissiveLicenses,
     all_authors: bool = True,
+    skip_redirect: bool = True,
 ):
+    if skip_redirect and [x for x in xml if x.tag.endswith("redirect")]:
+        return None
     revisions = [r for r in xml if r.tag.endswith("revision")]
     # TODO Handle if this fails and add logging.
     text = [t for t in revisions[-1] if t.tag.endswith("text")][0].text