-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_ninjal.py
executable file
·51 lines (43 loc) · 1.5 KB
/
scrape_ninjal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
from __future__ import annotations

import os
from argparse import ArgumentParser
from typing import Tuple, List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from packaging.version import Version
from requests.models import Response
class Handler:
    """Find the newest ``-full.zip`` archive on the NINJAL UniDic back-number page.

    The page is fetched over HTTP, the ``<div>`` whose id is
    ``unidic_<dictionary>`` is located, and every link inside it that ends
    with ``-full.zip`` is ranked by the version embedded in its file name.
    """

    # Site root and the path of the page that lists the archived releases.
    DOMAIN: str = "https://clrd.ninjal.ac.jp"
    INDEX_PATH: str = "/unidic/back_number.html"

    def __init__(self, dictionary: str) -> None:
        """Store the dictionary identifier to look up.

        Args:
            dictionary: Suffix of the ``<div>`` id to search
                (``unidic_<dictionary>``).
        """
        self.dictionary: str = dictionary

    def scrape(self, timeout: float = 30.0) -> None:
        """Print the URL of the highest-versioned ``-full.zip`` archive.

        Args:
            timeout: Seconds to wait for the index page before giving up.

        Raises:
            requests.RequestException: The page could not be fetched or the
                server answered with an error status.
            RuntimeError: The expected ``<div>`` or any ``-full.zip`` link
                is missing from the page.
        """
        print(self._latest_zip_url(timeout))

    def _latest_zip_url(self, timeout: float) -> str:
        """Return the absolute URL of the newest ``-full.zip`` archive."""
        index_url: str = self.DOMAIN + self.INDEX_PATH
        # A timeout keeps the script from hanging forever on a stalled
        # connection; raise_for_status stops us parsing an error page.
        response: Response = requests.get(index_url, timeout=timeout)
        response.raise_for_status()

        soup: BeautifulSoup = BeautifulSoup(response.text, "html.parser")
        section_id: str = f"unidic_{self.dictionary}"
        section = soup.find(name="div", attrs={"id": section_id})
        if section is None:
            raise RuntimeError(
                f"no <div id={section_id!r}> found at {index_url}"
            )

        candidates: List[Tuple[Version, str]] = []
        for anchor in section.find_all("a"):
            href = anchor.get("href")
            # Anchors without an href (e.g. named anchors) yield None.
            if not href or not href.endswith("-full.zip"):
                continue
            # File names look like "<prefix>-<version>-full.zip", so the
            # version is the second-to-last dash-separated field.
            version: str = os.path.basename(href).split("-")[-2]
            # urljoin copes with root-relative, page-relative and already
            # absolute hrefs alike.
            candidates.append(
                (Version(version=version), urljoin(index_url, href))
            )

        if not candidates:
            raise RuntimeError(
                f"no '-full.zip' link found in <div id={section_id!r}> "
                f"at {index_url}"
            )
        # max() returns the first of several equal maxima, matching what a
        # stable descending sort followed by [0] would select.
        return max(candidates, key=lambda item: item[0])[1]
def main() -> None:
    """Command-line entry point.

    Reads one positional argument, ``dictionary`` (either ``cwj`` or
    ``csj``), builds a ``Handler`` for it and runs its ``scrape`` method.
    """
    cli = ArgumentParser(description="print the latest version of zip file")
    cli.add_argument("dictionary", type=str, choices=["cwj", "csj"])
    namespace = cli.parse_args()
    # ``dictionary`` is the only argument the parser defines, so passing it
    # by name is the same as unpacking the whole namespace.
    Handler(dictionary=namespace.dictionary).scrape()
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()