Skip to content

Commit

Permalink
Initial upload
Browse files Browse the repository at this point in the history
  • Loading branch information
danielpontello committed Jan 6, 2023
0 parents commit 3736369
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea
out
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# mikucolle-scrape

A simple scraper that downloads all card images from https://mikucolle.gamerch.com/

![scraper screenshot](docs/mikuscrape.png)

## Usage

- Install requirements from requirements.txt: ```pip install -r requirements.txt```
- Run main.py: ```python main.py```

Binary file added docs/mikuscrape.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
89 changes: 89 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import os
import requests
import urllib.request

from bs4 import BeautifulSoup
from tqdm import tqdm


# Root of the wiki; card hrefs scraped from the index are relative to this.
BASE_URL = "https://mikucolle.gamerch.com"
# Card index page (URL-encoded Japanese title: "カード一覧" / "card list").
CARD_PAGE = "https://mikucolle.gamerch.com/%E3%82%AB%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7"
# Downloaded images are written under this directory, one subfolder per character.
OUTPUT_FOLDER = "out"
# Braille-art banner printed at startup — purely cosmetic.
MIKU = '''⠄⠄⠄⠄⠄⠄⣀⣀⠄⠄⠄⠄⣀⣀⣀⣀⣀⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄
⠄⠄⠄⣠⣤⠞⡋⠉⠧⠶⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣄⢀⠏⠲⣄⠄⠄⠄⠄
⠄⢀⡴⠋⢁⢐⣵⣶⣿⠟⣛⣿⣿⣿⠿⢿⣿⣦⣝⡻⣿⢇⡟⠄⣠⣿⣿⣷⣦⡀
⠄⠸⢳⡜⢱⣿⣿⠛⡅⣿⣿⣿⡟⣱⣿⣦⡙⣿⣿⣿⡆⡜⠄⣀⢹⣿⣿⣿⣿⣿
⠄⢰⣧⢱⣿⣿⢃⠾⣃⢿⣿⣿⢰⣿⣿⣿⠳⠘⣿⣿⣦⡙⢤⡻⠸⡿⠿⣿⣿⣿
⠄⣿⡟⣼⣿⡏⣴⣿⣿⡜⣿⣿⢸⣿⣿⣿⣿⣷⠸⣿⣿⣿⢲⣙⢦⠄⠄⣼⣿⣿
⢸⣿⡇⣿⣿⡇⣿⡏⠈⣷⣜⢿⢸⣿⣿⡟⠈⣿⣆⢹⣿⣿⠄⠙⣷⠄⠄⣿⣿⣿
⣾⣿⡇⣿⣿⠃⣿⡇⠰⣿⣿⣶⣸⣿⣿⣇⠰⣿⣿⡆⣿⡟⠄⠄⡏⠄⢸⣿⣿⡟
⠟⣵⣦⢹⣿⢸⣿⣿⣶⣿⣿⣥⣿⣿⣿⣿⣶⣿⣿⡇⣿⡇⣀⣤⠃⠄⡀⠟⠋⠄
⡘⣿⡰⠊⠇⢾⣿⣿⣿⣿⣟⠻⣿⡿⣻⣿⣿⣿⣿⢃⡿⢰⡿⠋⠄⠄⠄⠄⣠⣾
⣿⣌⠵⠋⠈⠈⠻⢿⣿⣿⣿⣿⣶⣾⣿⣿⣿⣿⡇⠸⣑⡥⢂⣼⡷⠂⠄⢸⣿⣿
⣿⣿⡀⠄⠄⠄⠄⠄⢌⣙⡛⢛⠛⣛⠛⣛⢋⣥⡂⢴⡿⣱⣿⠟⠄⠄⠄⠘⣿⣿
⣿⣿⣿⣷⣦⣄⣀⣀⡼⡿⣷⡜⡗⠴⠸⠟⣼⡿⣴⡓⢎⣛⠁⠄⠄⠄⠄⠄⢿⣿
⣿⣿⣿⣿⣿⣿⠄⠙⠻⢧⣿⣿⡜⣼⢸⣎⣭⣹⢸⡿⣣⠞⢷⡀⠄⠄⠄⠄⢸⣿
⣿⣿⣿⣿⣿⣿⠄⠄⠄⠄⣿⣿⡇⣿⢸⣿⣿⣿⡗⢨⠁⠄⠄⢳⡄⠄⠄⠄⢸⣿'''


def download_html(page, timeout=30):
    """Fetch *page* over HTTP and return its body text.

    Parameters:
        page: URL to fetch.
        timeout: seconds to wait for the server before aborting
            (keyword with a default, so existing callers are unchanged).

    Returns:
        The response body as a string on HTTP 200, otherwise ``None``.
    """
    # The original call had no timeout, so a stalled connection would
    # hang the scraper indefinitely; requests never times out by default.
    req = requests.get(page, timeout=timeout)
    if req.status_code == 200:
        return req.text
    return None


def extract_urls(html):
    """Parse the card-index HTML and return a list of absolute card-page URLs.

    Parameters:
        html: the card-index page source (as returned by ``download_html``).

    Returns:
        A list of ``BASE_URL``-prefixed URLs, one per card row. Rows that
        lack the expected cell or link are skipped; an unexpected page
        layout yields an empty list instead of raising.
    """
    urls = []

    bs = BeautifulSoup(html, "html.parser")
    table = bs.find("table")
    # Previously table.tbody was dereferenced unconditionally; a layout
    # change (or an error page) made this raise AttributeError.
    if table is None or table.tbody is None:
        return urls

    for tr in table.tbody.find_all("tr"):
        td = tr.find("td", {"data-col": "4"})
        if td is None:
            continue  # header/filler rows carry no data cells
        link = td.find("a")
        if link is None or not link.has_attr('href'):
            continue  # cell without a card link — nothing to collect

        urls.append(BASE_URL + link['href'])

    return urls


def download_images(url_list):
    """Download each card's image into ``OUTPUT_FOLDER/<character>/<card>.jpg``.

    Parameters:
        url_list: iterable of card-page URLs (as produced by ``extract_urls``).
            Progress is shown with a tqdm bar.

    Pages that fail to download or that are missing the expected image,
    character link, or name element are skipped instead of aborting the
    whole run (the original raised TypeError on any missing element).
    """
    for url in tqdm(url_list):
        html = download_html(url)
        if html is None:
            continue  # fetch failed; move on to the next card

        bs = BeautifulSoup(html, "html.parser")
        image = bs.find("img", {"class": "ui_wikidb_main_img"})
        character = bs.find("a", {"class": "ui_page_match"})
        name = bs.find("h2", {"id": "js_wikidb_main_name"})

        # Any of the three lookups can miss if the wiki layout changes;
        # skip such pages rather than crash mid-download.
        if image is None or character is None or name is None:
            continue

        image_url = image['src']
        character_name = character['title']
        filename = f"{name.text}.jpg"
        character_dir = os.path.join(OUTPUT_FOLDER, character_name)

        # exist_ok=True already tolerates a pre-existing directory, so the
        # former os.path.exists() pre-check was redundant (and racy).
        os.makedirs(character_dir, exist_ok=True)

        output_file = os.path.join(character_dir, filename)
        urllib.request.urlretrieve(image_url, filename=output_file)


def main():
    """Entry point: fetch the card index, then download every card image."""
    print("MikuColle scraper!")
    print(MIKU)
    print()
    print("downloading main page")
    html = download_html(CARD_PAGE)
    if html is None:
        # download_html returns None on any non-200 response; the original
        # passed None straight into BeautifulSoup, which raised a confusing
        # TypeError far from the actual failure.
        print("failed to download main page")
        return
    print("extracting card urls")
    urls = extract_urls(html)
    print("downloading card images")
    download_images(urls)
    print("done!")


if __name__ == '__main__':
main()
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests~=2.28.1
beautifulsoup4~=4.11.1
tqdm~=4.64.1

0 comments on commit 3736369

Please sign in to comment.