#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2017/10/31 0031 15:22
# @Author : KangQiang
# @File : get_page.py
# @Software: PyCharm
# Fetch a web page and return its text
import random
import threading
import time
import redis
import requests
from config import logger, REDIS_URL, headers, test_ip_url, ip_interval
# Keeps switching the outgoing IP so the crawler is never blocked for long
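# Assumed setup (inferred from this module, not documented elsewhere): an ADSL
# dial-up host watches the 'change_ip' key in Redis db 1; when the key is set,
# it redials, obtains a new public IP and writes it back to the 'ip' key. The
# crawler then routes its requests through <ip>:3128 (e.g. a squid proxy).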
timeout = 15  # timeout in seconds for page requests
changing_ip_flag = True  # True while the proxy IP is being switched
correct_page_count = 0  # number of pages fetched with HTTP status 200
ip = ''  # current proxy IP; set by change_ip() before the proxy is used


# Ask the ADSL server to switch IPs and return the new one
def change_get_ip():
basic_redis = redis.Redis.from_url(REDIS_URL, db=1, decode_responses=True)
pre_ip = basic_redis.get('ip')
basic_redis.set('change_ip', 1)
while pre_ip == basic_redis.get('ip'): # Wait until adsl server get a new ip successfully.
time.sleep(1)
return basic_redis.get('ip')


# Validate the freshly switched IP and return a usable one
def get_valid_ip():
temp_ip = ''
for _ in range(6):
        temp_ip = change_get_ip()  # The ADSL server has switched IPs; verify from the crawler that the new IP works
try:
requests.get(test_ip_url, headers=headers, timeout=7, proxies={'https': 'https://' + temp_ip + ':3128'})
except Exception as e:
if 'Caused by ConnectTimeoutError' in str(e) or 'Max retries exceeded' in str(e):
logger.info('Invalid ip:{}'.format(temp_ip))
continue
else:
logger.info('Error In get_valid_ip:{}'.format(str(e)))
continue
else:
logger.info('Valid ip:{}'.format(str(temp_ip)))
break
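    # Note: if all six attempts fail, the last candidate IP is returned even
    # though it never passed validation, so callers may still get a dead proxy.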
return temp_ip


# Wrap the IP switch so IpChanger can refresh the module-level ip
def change_ip():
global ip
ip = get_valid_ip()


def ip_flag(value):
global changing_ip_flag
changing_ip_flag = value


def get_count():
global correct_page_count
return correct_page_count


def get_ip():
global ip
return ip


def get_page(url):
    '''
    Fetch url with a GET request, through the proxy unless an IP switch is in progress.
    :param url: page URL to fetch
    :return: response text on HTTP 200, otherwise None
    '''
global ip
global changing_ip_flag
global correct_page_count
try:
if changing_ip_flag:
            # An IP switch is in progress, so fetch directly without the proxy
time.sleep(random.randint(1, 5))
response = requests.get(url, headers=headers, timeout=timeout)
else:
response = requests.get(url, headers=headers, timeout=timeout, proxies={'https': 'https://' + ip + ':3128'})
if response.status_code == 200:
correct_page_count += 1
return response.text
else:
return None
except Exception as e:
if 'Cannot connect to proxy' in str(e):
return None
elif 'Caused by ConnectTimeoutError' in str(e):
return None
elif 'read timeout' in str(e) or 'Read timed out' in str(e):
return None
else:
            logger.error("In get_page exception: {}".format(str(e)))
return None


def post_page(url, data):
    '''
    Fetch url with a POST request, through the proxy unless an IP switch is in progress.
    :param url: target URL
    :param data: form data to post
    :return: response text on HTTP 200, otherwise None
    '''
global ip
global changing_ip_flag
global correct_page_count
try:
if changing_ip_flag:
            # An IP switch is in progress, so post directly without the proxy
time.sleep(random.randint(1, 4))
response = requests.post(url, headers=headers, timeout=timeout, data=data)
else:
response = requests.post(url, headers=headers, timeout=timeout, data=data,
proxies={'https': 'https://' + ip + ':3128'})
if response.status_code == 200:
correct_page_count += 1
return response.text
else:
return None
except Exception as e:
if 'Cannot connect to proxy' in str(e):
return None
elif 'Caused by ConnectTimeoutError' in str(e):
return None
elif 'read timeout' in str(e) or 'Read timed out' in str(e):
return None
else:
            logger.error("In post_page exception: {}".format(str(e)))
return None


# Switch the IP every ip_interval seconds
class IpChanger(threading.Thread):
def __init__(self):
super(IpChanger, self).__init__()
def run(self):
while True:
time.sleep(ip_interval)
logger.info('Change Ip')
            ip_flag(True)  # While switching, make the crawler fetch without the proxy
change_ip()
            ip_flag(False)  # A valid IP is ready; let the crawler use the proxy again


# Typical startup sequence:
# change_ip()
# logger.info('Initial ip:{}'.format(ip))
# t = IpChanger()
# t.start()
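
# A minimal fetch-loop sketch following the startup above (the URLs are
# hypothetical; it assumes config.py provides the imported names and that the
# ADSL/Redis setup described at the top of this file is running):
# ip_flag(False)  # after change_ip() succeeds, route requests through the proxy
# for i in range(1, 4):
#     html = get_page('https://example.com/page/{}'.format(i))  # hypothetical URL
#     if html is None:
#         logger.info('Fetch failed for page {}'.format(i))
# logger.info('{} pages fetched successfully'.format(get_count()))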