Skip to content

Commit

Permalink
refactor(gccp): make cli args more consistent with pywikibot
Browse files — browse the repository at this point in the history
  • Loading branch information
davla committed Feb 15, 2025
1 parent affece2 commit 5df1d77
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 1,144 deletions.
45 changes: 25 additions & 20 deletions gccp/gccp-download.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import argparse
import os
import os.path

Expand All @@ -7,10 +6,10 @@

"""
Script that downloads pages to text files. Arguments:
--fam: name of family in Pywikibot repository.
--enpages: path where EN pages will be downloaded (optional, default data/gccp-enpages).
--itpages: path where IT pages will be downloaded (optional, default data/gccp-itpages).
--overwrite: 'yes' to overwrite existing pages, otherwise they are skipped.
-fam:<name> name of family in Pywikibot repository.
-enpages:<path> path where EN pages will be downloaded (optional, default data/gccp-enpages).
-itpages:<path> path where IT pages will be downloaded (optional, default data/gccp-itpages).
-overwrite:<yes|no> 'yes' to overwrite existing pages, otherwise they are skipped.
"""


Expand All @@ -36,35 +35,41 @@ def title_en_to_it(title_en):
# main function
def main():
# parse args
parser = argparse.ArgumentParser()
parser.add_argument("--fam", default="encypok")
parser.add_argument("--enpages", default="data/gccp-enpages")
parser.add_argument("--itpages", default="data/gccp-itpages")
parser.add_argument("--overwrite", default="no")
args = parser.parse_args()
local_args = pywikibot.handle_args()

args = {
"fam": "encypok",
"enpages": "data/gccp-enpages",
"itpages": "data/gccp-itpages",
"overwrite": "no",
}
for arg in local_args:
arg_name, _, arg_value = arg[1:].partition(":")
args[arg_name] = arg_value

# setup
overwrite = args.overwrite.strip().lower() == "yes"
if not os.path.isdir(args.enpages):
os.mkdir(args.enpages)
if not os.path.isdir(args.itpages):
os.mkdir(args.itpages)
site_it = pywikibot.Site("it", fam=args.fam)
site_en = pywikibot.Site("en", fam=args.fam)
overwrite = args["overwrite"].strip().lower() == "yes"
if not os.path.isdir(args["enpages"]):
os.mkdir(args["enpages"])
if not os.path.isdir(args["itpages"]):
os.mkdir(args["itpages"])
site_it = pywikibot.Site("it", fam=args["fam"])
site_en = pywikibot.Site("en", fam=args["fam"])
# retrieve list of EN pages and filter them
cat_en = pywikibot.Category(site_en, "Category:Pokémon TCG Pocket species by name") # fmt: skip
pages_en = pagegenerators.CategorizedPageGenerator(cat_en, recurse=True)
pages_en = [p for p in pages_en if p.title().endswith(" (TCG Pocket)")]
for page_en in pages_en:
# retrieve EN page
page_en_file = os.path.join(args.enpages, f"{page_en.title()}.txt")
page_en_file = os.path.join(args["enpages"], f"{page_en.title()}.txt")
if not os.path.isfile(page_en_file) or overwrite:
print(f"Downloading EN page: {page_en_file}")
download_page(page=page_en, dest_file=page_en_file)
else:
print(f"Skipping already existing EN page: {page_en_file}")
# retrieve IT page if exists
title_it = title_en_to_it(page_en.title())
page_it_file = os.path.join(args.itpages, f"{title_it}.txt")
page_it_file = os.path.join(args["itpages"], f"{title_it}.txt")
page_it = pywikibot.Page(site_it, title_it)
if not page_it.exists():
print(f"Skipping missing IT page: {page_it_file}")
Expand Down
Loading

0 comments on commit 5df1d77

Please sign in to comment.