Skip to content

Commit

Permalink
Initial upload
Browse files Browse the repository at this point in the history
  • Loading branch information
danielpontello committed Jan 6, 2023
0 parents commit 3736369
Show file tree
Hide file tree
Showing 5 changed files with 105 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.idea
out
11 changes: 11 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# mikucolle-scrape

A simple scraper that downloads all card images from https://mikucolle.gamerch.com/

![scraper screenshot](docs/mikuscrape.png)

## Usage

- Install requirements from requirements.txt: ```pip install -r requirements.txt```
- Run main.py: ```python main.py```

Binary file added docs/mikuscrape.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
89 changes: 89 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import os
import requests
import urllib.request

from bs4 import BeautifulSoup
from tqdm import tqdm


# Root of the wiki; card hrefs scraped from the index are relative to this.
BASE_URL = "https://mikucolle.gamerch.com"
# Card index page (URL-encoded Japanese title: "カード一覧" / "card list").
CARD_PAGE = "https://mikucolle.gamerch.com/%E3%82%AB%E3%83%BC%E3%83%89%E4%B8%80%E8%A6%A7"
# Downloaded images are written under this directory, one subfolder per character.
OUTPUT_FOLDER = "out"
# Braille-art banner printed at startup — purely cosmetic.
MIKU = '''⠄⠄⠄⠄⠄⠄⣀⣀⠄⠄⠄⠄⣀⣀⣀⣀⣀⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄⠄
⠄⠄⠄⣠⣤⠞⡋⠉⠧⠶⣿⣿⣿⣿⣿⣿⣿⣿⣿⣷⣦⣄⢀⠏⠲⣄⠄⠄⠄⠄
⠄⢀⡴⠋⢁⢐⣵⣶⣿⠟⣛⣿⣿⣿⠿⢿⣿⣦⣝⡻⣿⢇⡟⠄⣠⣿⣿⣷⣦⡀
⠄⠸⢳⡜⢱⣿⣿⠛⡅⣿⣿⣿⡟⣱⣿⣦⡙⣿⣿⣿⡆⡜⠄⣀⢹⣿⣿⣿⣿⣿
⠄⢰⣧⢱⣿⣿⢃⠾⣃⢿⣿⣿⢰⣿⣿⣿⠳⠘⣿⣿⣦⡙⢤⡻⠸⡿⠿⣿⣿⣿
⠄⣿⡟⣼⣿⡏⣴⣿⣿⡜⣿⣿⢸⣿⣿⣿⣿⣷⠸⣿⣿⣿⢲⣙⢦⠄⠄⣼⣿⣿
⢸⣿⡇⣿⣿⡇⣿⡏⠈⣷⣜⢿⢸⣿⣿⡟⠈⣿⣆⢹⣿⣿⠄⠙⣷⠄⠄⣿⣿⣿
⣾⣿⡇⣿⣿⠃⣿⡇⠰⣿⣿⣶⣸⣿⣿⣇⠰⣿⣿⡆⣿⡟⠄⠄⡏⠄⢸⣿⣿⡟
⠟⣵⣦⢹⣿⢸⣿⣿⣶⣿⣿⣥⣿⣿⣿⣿⣶⣿⣿⡇⣿⡇⣀⣤⠃⠄⡀⠟⠋⠄
⡘⣿⡰⠊⠇⢾⣿⣿⣿⣿⣟⠻⣿⡿⣻⣿⣿⣿⣿⢃⡿⢰⡿⠋⠄⠄⠄⠄⣠⣾
⣿⣌⠵⠋⠈⠈⠻⢿⣿⣿⣿⣿⣶⣾⣿⣿⣿⣿⡇⠸⣑⡥⢂⣼⡷⠂⠄⢸⣿⣿
⣿⣿⡀⠄⠄⠄⠄⠄⢌⣙⡛⢛⠛⣛⠛⣛⢋⣥⡂⢴⡿⣱⣿⠟⠄⠄⠄⠘⣿⣿
⣿⣿⣿⣷⣦⣄⣀⣀⡼⡿⣷⡜⡗⠴⠸⠟⣼⡿⣴⡓⢎⣛⠁⠄⠄⠄⠄⠄⢿⣿
⣿⣿⣿⣿⣿⣿⠄⠙⠻⢧⣿⣿⡜⣼⢸⣎⣭⣹⢸⡿⣣⠞⢷⡀⠄⠄⠄⠄⢸⣿
⣿⣿⣿⣿⣿⣿⠄⠄⠄⠄⣿⣿⡇⣿⢸⣿⣿⣿⡗⢨⠁⠄⠄⢳⡄⠄⠄⠄⢸⣿'''


def download_html(page, timeout=30):
    """Fetch *page* over HTTP and return its body text.

    Parameters:
        page: URL to fetch.
        timeout: seconds to wait for the server before aborting
            (keyword with a default, so existing callers are unchanged).

    Returns:
        The response body as a string on HTTP 200, otherwise ``None``.
    """
    # The original call had no timeout, so a stalled connection would
    # hang the scraper indefinitely; requests never times out by default.
    req = requests.get(page, timeout=timeout)
    if req.status_code == 200:
        return req.text
    return None


def extract_urls(html):
    """Parse the card-index HTML and return a list of absolute card-page URLs.

    Parameters:
        html: the card-index page source (as returned by ``download_html``).

    Returns:
        A list of ``BASE_URL``-prefixed URLs, one per card row. Rows that
        lack the expected cell or link are skipped; an unexpected page
        layout yields an empty list instead of raising.
    """
    urls = []

    bs = BeautifulSoup(html, "html.parser")
    table = bs.find("table")
    # Previously table.tbody was dereferenced unconditionally; a layout
    # change (or an error page) made this raise AttributeError.
    if table is None or table.tbody is None:
        return urls

    for tr in table.tbody.find_all("tr"):
        td = tr.find("td", {"data-col": "4"})
        if td is None:
            continue  # header/filler rows carry no data cells
        link = td.find("a")
        if link is None or not link.has_attr('href'):
            continue  # cell without a card link — nothing to collect

        urls.append(BASE_URL + link['href'])

    return urls


def download_images(url_list):
    """Download each card's image into ``OUTPUT_FOLDER/<character>/<card>.jpg``.

    Parameters:
        url_list: iterable of card-page URLs (as produced by ``extract_urls``).
            Progress is shown with a tqdm bar.

    Pages that fail to download or that are missing the expected image,
    character link, or name element are skipped instead of aborting the
    whole run (the original raised TypeError on any missing element).
    """
    for url in tqdm(url_list):
        html = download_html(url)
        if html is None:
            continue  # fetch failed; move on to the next card

        bs = BeautifulSoup(html, "html.parser")
        image = bs.find("img", {"class": "ui_wikidb_main_img"})
        character = bs.find("a", {"class": "ui_page_match"})
        name = bs.find("h2", {"id": "js_wikidb_main_name"})

        # Any of the three lookups can miss if the wiki layout changes;
        # skip such pages rather than crash mid-download.
        if image is None or character is None or name is None:
            continue

        image_url = image['src']
        character_name = character['title']
        filename = f"{name.text}.jpg"
        character_dir = os.path.join(OUTPUT_FOLDER, character_name)

        # exist_ok=True already tolerates a pre-existing directory, so the
        # former os.path.exists() pre-check was redundant (and racy).
        os.makedirs(character_dir, exist_ok=True)

        output_file = os.path.join(character_dir, filename)
        urllib.request.urlretrieve(image_url, filename=output_file)


def main():
    """Entry point: fetch the card index, then download every card image."""
    print("MikuColle scraper!")
    print(MIKU)
    print()
    print("downloading main page")
    html = download_html(CARD_PAGE)
    if html is None:
        # download_html returns None on any non-200 response; the original
        # passed None straight into BeautifulSoup, which raised a confusing
        # TypeError far from the actual failure.
        print("failed to download main page")
        return
    print("extracting card urls")
    urls = extract_urls(html)
    print("downloading card images")
    download_images(urls)
    print("done!")


if __name__ == '__main__':
main()
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests~=2.28.1
beautifulsoup4~=4.11.1
tqdm~=4.64.1

0 comments on commit 3736369

Please sign in to comment.