-
Notifications
You must be signed in to change notification settings - Fork 106
/
Copy pathbook118.py
114 lines (99 loc) · 3.45 KB
/
book118.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException
import base64
import time
import sys
import os
import shutil
import requests
from tqdm import trange
from img2pdf import conpdf
def download(url):
    """Download a book118 document preview and convert it to a PDF.

    Opens ``url`` in Chrome through Selenium, expands the preview, saves
    every page image as ``./temp/<title>/<index>.png`` and then calls
    ``conpdf`` to build ``output/<title>.pdf``.  When the page title
    contains ``'ppt'`` the job is handed over to ``book118_PPT.download``.

    The browser is always closed before this function returns or raises.

    Args:
        url: Address of the document page to download.

    Returns:
        None. Progress and failure messages are printed to stdout.
    """
    option = webdriver.ChromeOptions()
    # option.add_argument('headless')  # uncomment to run without a window
    option.add_argument('log-level=3')
    # NOTE(review): `executable_path` is kept as in the original call; newer
    # Selenium releases may expect a Service object instead - confirm against
    # the installed version.
    driver = webdriver.Chrome(
        executable_path='.//chromedriver', options=option)

    is_ppt = False
    saved = False
    try:
        title = _open_page(driver, url)
        print(title)
        is_ppt = 'ppt' in title
        if not is_ppt:
            time.sleep(2)
            _click_agree_button(driver)
            _expand_preview(driver)
            saved = _save_page_images(driver, title)
    finally:
        # Close the browser on every path, including unexpected errors.
        driver.quit()

    if is_ppt:
        # PPT documents are handled by a dedicated module; our own browser
        # is already closed at this point.
        import book118_PPT
        book118_PPT.download(url)
        return
    if not saved:
        return

    print('下载完毕,正在转码')
    # Make sure the target directory exists before the PDF is written.
    os.makedirs('output', exist_ok=True)
    conpdf(f'output/{title}.pdf', f'temp/{title}', '.png')


def _open_page(driver, url):
    """Load ``url`` and return the page title, or ``"output"`` on failure."""
    title = "output"
    try:
        driver.set_page_load_timeout(15)
        driver.get(url)
        title = driver.title
    except Exception:
        # Best effort: a slow page is still usable, so keep going with the
        # fallback title instead of aborting.
        print("Timeout - start download anyway.")
    return title


def _click_agree_button(driver):
    """Click the ``agree_full`` element, or the first ``big`` element.

    Presumably these open the full preview - confirm against the site.
    Neither element is required, so failures are ignored on purpose.
    """
    try:
        driver.find_element(By.ID, "agree_full").click()
    except Exception:
        try:
            driver.find_elements(By.CLASS_NAME, 'big')[0].click()
        except Exception:
            # Deliberately ignored: the page may not show either element.
            pass
    finally:
        time.sleep(1)


def _expand_preview(driver, max_failures=10):
    """Keep clicking ``btn_preview_remain`` until it disappears.

    Stops as soon as the button is no longer found.  Gives up after
    ``max_failures`` consecutive unexpected errors so that a button which
    stays in the page but cannot be clicked does not loop forever.
    """
    failures = 0
    while failures < max_failures:
        try:
            # Expand all remaining pages.
            button = driver.find_element(By.ID, "btn_preview_remain")
            driver.execute_script(
                "arguments[0].scrollIntoView(true);", button)
            ActionChains(driver).move_to_element(button).perform()
            # Scroll back up a little before clicking (presumably so the
            # button is not hidden behind a fixed page element).
            driver.execute_script("window.scrollBy(0, -100)")
            time.sleep(2)
            button.click()
            failures = 0
        except NoSuchElementException:
            break
        except Exception:
            import traceback
            failures += 1
            traceback.print_exc()
        finally:
            time.sleep(1)


def _save_page_images(driver, title):
    """Save every preview page image into ``./temp/<title>/``.

    Returns:
        True when all pages were written, False when a page failed (the
        failure is printed and the remaining pages are skipped).
    """
    # Read the page count: the last space-separated token of the element.
    counts_html = driver.find_element(
        By.CLASS_NAME, 'counts').get_attribute('innerHTML')
    num_of_pages = int(counts_html.split(' ')[-1])

    # NOTE(review): `title` comes from the web page and is used verbatim as
    # a directory name.
    page_dir = f'./temp/{title}'
    if os.path.exists(page_dir):
        shutil.rmtree(page_dir)
    os.makedirs(page_dir)

    elems = driver.find_elements(By.CLASS_NAME, "webpreview-item")
    for page_index in trange(num_of_pages):
        try:
            elem = elems[page_index]
            time.sleep(0.5)
            # Hover the page container before reading its image attributes.
            ActionChains(driver).move_to_element(elem).perform()
            img = elem.find_element(By.TAG_NAME, 'img')

            # Wait up to ~10 seconds for either attribute to appear.
            attempts = 0
            while (attempts < 10
                   and img.get_attribute('data-src') is None
                   and img.get_attribute('src') is None):
                attempts += 1
                time.sleep(1)

            img_url = img.get_attribute('src')
            if img_url is None or 'http' not in img_url:
                lazy_src = img.get_attribute('data-src')
                if lazy_src is None:
                    raise ValueError(
                        f"no image address found for page {page_index}")
                img_url = "http:" + lazy_src

            res = requests.get(img_url, timeout=30)
            # Do not store an HTTP error page as if it were an image.
            res.raise_for_status()
            with open(f"./temp/{title}/{page_index}.png", "wb") as fh:
                fh.write(res.content)
        except Exception as e:
            print("下载失败!\n%r" % e)
            return False
    return True
if __name__ == '__main__':
    # Use the URL given as the first command-line argument; fall back to the
    # sample document when the script is started without arguments.
    download(sys.argv[1] if len(sys.argv) > 1
             else "https://max.book118.com/html/2017/1206/143048522.shtm")