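"""MangaScraper: download mangas from manganato.com as pdf files.

The script searches manganato.com for a title, downloads every page image of
each chapter, bundles the pages into pdfs (optionally in fixed-size batches),
and records progress in info.json so an interrupted download can resume.
"""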
import argparse
import atexit
import json
import os
import random
import time

import bs4
import requests
from PIL import Image
from tqdm import tqdm


def exit_handler(folder_name, pdf_var, pdf_pages, offset, last_chapter, current_pdfs):
    """Save the pages downloaded so far as a pdf and record progress in info.json."""
    # Remove any leftover page images from an interrupted chapter
    all_files = os.listdir(f"{folder_name}/")
    for item in all_files:
        if item.endswith(".jpg"):
            os.remove(os.path.join(folder_name, item))
    current_pdfs += 1
    if args.batch_size != -1:
        pdf_var.save(f'{folder_name}/{folder_name}_Batch_{current_pdfs}-{(last_chapter - offset) % args.batch_size + 1}_chapters.pdf', save_all=True, append_images=pdf_pages)
    else:
        pdf_var.save(f'{folder_name}/{folder_name}-{last_chapter}_chapters.pdf', save_all=True, append_images=pdf_pages)
    json_file = json.dumps({
        "last_chapter": last_chapter + 1,
        "current_pdfs": current_pdfs,
    }, indent=4)
    with open(f"{folder_name}/info.json", "w") as outfile:
        outfile.write(json_file)
    print(f"Finished downloading {last_chapter - offset + 1} chapters!")


# These should not be modified
session = None
args = None
headers_list = [
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3"
    },
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.3"
    },
    {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.3"
    },
]
default_json = {
    "last_chapter": 0,
    "current_pdfs": 0,
}


def download_chapters(base_url, manga_name, starting_chapter, current_pdfs):
    """Walk the chapter pages starting at base_url, collecting pages into batched pdfs."""
    # Inits
    pdf = None
    current_page = base_url
    pdf_pages = []
    iteration = starting_chapter
    # Iterate through the chapters until there are no more
    while current_page is not None:
        # Get the html of the current chapter page
        html = session.get(current_page, headers=random.choice(headers_list), allow_redirects=False)
        soup = bs4.BeautifulSoup(html.text, 'html.parser')
        # Collect the image links of the current chapter
        reader_container = soup.find("div", {"class": "container-chapter-reader"})
        image_links = [x['src'] for x in reader_container.find_all("img")]
        # Download every image of the current chapter
        for i in tqdm(range(len(image_links)), desc=f"Downloading chapter {iteration + 1}"):
            image_url = image_links[i]
            file_name = f"{manga_name}/image_{i + 1}.jpg"
            # Copy the chosen headers so the shared dicts in headers_list are not
            # mutated, then set the Referer to the current chapter page
            headers = dict(random.choice(headers_list))
            headers["Referer"] = current_page
            with session.get(image_url, headers=headers) as response:
                with open(file_name, "wb") as f:
                    f.write(response.content)
            # Add the image to the page list; the first image becomes the pdf root
            if pdf is None:
                pdf = Image.open(file_name).convert('RGB')
            else:
                pdf_pages.append(Image.open(file_name).convert('RGB'))
            # Delete the image from disk (its contents are already held in memory)
            # and sleep between requests if asked to
            os.remove(file_name)
            time.sleep(args.delay_between_requests)
        # Re-register the exit handler so progress up to this chapter is saved on exit
        atexit.unregister(exit_handler)
        atexit.register(exit_handler, manga_name, pdf, pdf_pages, starting_chapter, iteration, current_pdfs)
        # Check whether there is a next chapter
        next_chapter = soup.find("a", {"class": "navi-change-chapter-btn-next"})
        if next_chapter is not None:
            # If there is, move to it and save the current batch once it is full
            current_page = next_chapter['href']
            if args.batch_size != -1 and (iteration - starting_chapter + 1) % args.batch_size == 0:
                current_pdfs += 1
                pdf.save(f'{manga_name}/Batch_{current_pdfs}-{args.batch_size}_chapters.pdf', save_all=True, append_images=pdf_pages)
                pdf_pages = []
                pdf = None
            iteration += 1
        else:
            # If not, stop; the exit handler saves the remaining pages
            current_page = None


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        prog='MangaScraper',
        description='Download pdf mangas from manganato.com',
        epilog='Program made by PxGluz')
    parser.add_argument('-s', '-search-size', default=5, type=int, dest="search_size", help="how many results should be shown when searching")
    parser.add_argument('-b', '-batch-size', default=-1, type=int, dest="batch_size", help="how many chapters should be saved in each pdf, leave blank for a single pdf with all chapters")
    parser.add_argument('-d', '-delay-between-requests', default=0, type=float, dest="delay_between_requests", help="delay between each of the made requests")
    args = parser.parse_args()
    session = requests.Session()
    const_link = "https://manganato.com/search/story/"
    while True:
        search_name = input("Please input the name of the manga you want downloaded: ").replace(' ', '_')
        html_page = session.get(const_link + search_name.lower(), headers=random.choice(headers_list), allow_redirects=False)
        search_soup = bs4.BeautifulSoup(html_page.text, 'html.parser')
        search_results = search_soup.find("div", {"class": "panel-search-story"})
        if search_results is not None:
            elements = search_results.find_all("a", {"class": "item-img bookmark_check"})[:args.search_size]
            links = [x['href'] for x in elements]
            titles = [x['title'] for x in elements]
            while True:
                print("\n0. Return to searching\n")
                for i in range(len(titles)):
                    print(f"{i + 1}. {titles[i]}")
                # Guard against non-numeric input instead of crashing
                try:
                    choice = int(input("Choose one of the results: "))
                except ValueError:
                    choice = -1
                if choice < 0 or choice > len(titles):
                    print("\nInvalid input!\n")
                else:
                    if choice != 0:
                        chapters_html = session.get(links[choice - 1], headers=random.choice(headers_list), allow_redirects=False)
                        chapters_soup = bs4.BeautifulSoup(chapters_html.text, 'html.parser')
                        chapter_list = chapters_soup.find("ul", {"class": "row-content-chapter"})
                        if chapter_list is not None:
                            starting_chapter = 0
                            current_pdfs = 0
                            # Create the directory for the pdfs
                            os.makedirs(titles[choice - 1], exist_ok=True)
                            dir_path = f"{titles[choice - 1]}/"
                            if not os.path.isfile(f"{dir_path}info.json"):
                                json_file = json.dumps(default_json, indent=4)
                                with open(f"{dir_path}info.json", "w") as outfile:
                                    outfile.write(json_file)
                            else:
                                # Resume from the progress recorded on a previous run
                                with open(f"{dir_path}info.json", "r") as openfile:
                                    json_data = json.load(openfile)
                                starting_chapter = json_data["last_chapter"]
                                current_pdfs = json_data["current_pdfs"]
                                print(f"Skipping to chapter: {starting_chapter + 1}")
                            chapter_children = chapter_list.find_all("a", {"class": "chapter-name text-nowrap"})
                            if starting_chapter >= len(chapter_children):
                                print("\nNo new chapters... Returning to search")
                                choice = 0
                            else:
                                # Chapters are listed newest first, so index from the end
                                starting_chapter_link = chapter_children[-(starting_chapter + 1)]["href"]
                                download_chapters(starting_chapter_link, titles[choice - 1], starting_chapter, current_pdfs)
                        else:
                            print("\nManga has no chapters... Returning to search")
                            choice = 0
                    break
            if choice != 0:
                break
        else:
            print("\nNo results... Returning to search")