#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/6/29 0029 18:08
# @Author : KangQiang
# @File : Crawler.py
# @Software: PyCharm
import json
import threading
import time
from queue import Queue
from config import (UN_PROCESSED, PROCESSING, PROCESSED, logger, train_tasks_status_db,
                    train_result_filename, format_url, ip_interval)
from pymongo import UpdateOne
from get_page import get_page, get_count
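
# Assumed external modules (not shown in this file; inferred from how they are used here):
#   config.py   - status constants (UN_PROCESSED, PROCESSING, PROCESSED), a configured logger,
#                 the MongoDB collection train_tasks_status_db, the output file name
#                 train_result_filename, the query URL template format_url, and ip_interval.
#   get_page.py - get_page(url) fetches one page and returns response.text (or a falsy value on
#                 failure); get_count() returns the running total of pages fetched so far.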
task_queue = Queue()
response_queue = Queue()


def parse(html_response):
    '''
    Parse the response.text returned by get_page and extract the useful fields.
    :param html_response: raw JSON text of one query page
    :return: list of train records between the departure and arrival stations, e.g.
        [['5l0000D35273', 'D352', 'AOH', 'ICW', 'AOH', 'ICW', '06:11', '20:27', '14:16', 'Y'],
         ['5l0000D63640', 'D636', 'AOH', 'ICW', 'AOH', 'ICW', '06:33', '21:01', '14:28', 'Y']]
        Returns None if parsing fails.
    '''
    time_detail = []
    data = json.loads(html_response)["data"]
    result = data["result"]
    try:
        for train in result:
            temp_list = train[train.index("|") + 1:].split("|")
            time_detail.append(temp_list[1:11])
    except Exception as e:
        logger.critical(str(e) + str(result))
        return None
    return time_detail


def construct_url(url_paras):
    '''
    Build the query URL for one task from url_paras.
    :param url_paras: string with the parameters separated by '-' (departure and arrival station codes)
    :return: the query URL. Note that format_url also contains a train_date parameter,
        which is set to 5 days after the time the program is started.
    '''
    return format_url.format(from_station=url_paras.split("-")[0], to_station=url_paras.split("-")[1])
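

# A hedged usage sketch: assuming format_url in config.py is a query template with
# {from_station} and {to_station} placeholders (station telecodes, as in the parse() example),
# construct_url("AOH-ICW") fills from_station=AOH and to_station=ICW.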


class Crawler(threading.Thread):
    '''
    Crawling thread: takes tasks off task_queue, fetches and parses the page,
    and puts the parsed data on response_queue.
    '''

    def __init__(self, task_status_db, parse_fun, construct_url_fun):
        '''
        :param task_status_db: task database recording the crawl status of every task,
            with url_paras as the primary key (_id)
        :param parse_fun: function that parses response.text and returns the useful data
        :param construct_url_fun: function that builds the task URL from an item taken off task_queue
        '''
        super(Crawler, self).__init__()
        self.collection = task_status_db
        self.parse = parse_fun
        self.construct_url = construct_url_fun

    def run(self):
        global task_queue
        global response_queue
        while True:
            url_paras = task_queue.get()
            task_url = self.construct_url(url_paras)
            try:
                # get_page wraps the actual requests call
                response = get_page(task_url)
                if response:
                    data = self.parse(response)
                    if data is not None:  # None means parsing response.text failed
                        response_queue.put((data, url_paras))
                    else:
                        self.collection.update_one({'_id': url_paras}, update={'$set': {'status': UN_PROCESSED}})
                else:
                    # reset the status in the database to UN_PROCESSED so the task is retried
                    self.collection.update_one({'_id': url_paras}, update={'$set': {'status': UN_PROCESSED}})
            except Exception as e:
                logger.critical('In Crawler:{}'.format(str(e)) + str(task_url))
                self.collection.update_one({'_id': url_paras}, update={'$set': {'status': UN_PROCESSED}})


class TaskProducer(threading.Thread):
    def __init__(self, task_status_db):
        super(TaskProducer, self).__init__()
        self.collection = task_status_db

    def run(self):
        global task_queue
        while True:
            try:
                if task_queue.qsize() < 300:
                    temp = self.collection.find({'status': UN_PROCESSED}, limit=60)
                    for single_item in temp:
                        # mark the task as PROCESSING before it is queued
                        self.collection.update_one({'_id': single_item['_id']},
                                                   update={'$set': {'status': PROCESSING}})
                        task_queue.put(single_item['_id'])
                else:
                    time.sleep(3)
            except Exception as e:
                logger.critical('In TaskProducer:{}'.format(str(e)))


class DataSaver3(threading.Thread):
    def __init__(self, task_status_db, file_name):
        '''
        :param task_status_db: database recording task status; the crawled data is stored in it as well
        :param file_name: name of the file the data is saved to (used by save2file)
        '''
        super(DataSaver3, self).__init__()
        self.status_db = task_status_db  # stores both the task status and the crawled data
        self.file_name = file_name

    def save2db(self, size):
        '''
        Take `size` parsed responses off response_queue and write them to the database in one bulk operation.
        :param size: number of responses to consume
        :return:
        '''
        global response_queue
        ops = []  # bulk write operations to run against status_db
        try:
            for _ in range(size):
                data, url_paras = response_queue.get()  # the parsed response of one page
                ops.append(
                    UpdateOne({'_id': url_paras}, update={'$set': {'status': PROCESSED, 'data': data}}))
                # self.status_db.update_one({'_id': recordId}, {'$set': {'status': PROCESSED, 'data': data}})
            if ops:
                self.status_db.bulk_write(ops, ordered=False)
        except Exception as e:
            if 'batch op errors occurred' not in str(e):
                logger.error('In save2db:' + str(e))

    def save2file(self, size):
        '''
        Take `size` parsed responses off response_queue and append them to the local file.
        :param size: number of responses to consume
        :return:
        '''
        global response_queue
        with open(self.file_name, 'a', encoding="utf-8") as f:
            for _ in range(size):
                try:
                    data, url_para = response_queue.get()
                    self.status_db.find_one_and_update({'_id': url_para},
                                                       {'$set': {'status': PROCESSED}})  # mark the task as PROCESSED
                    f.write(json.dumps({"_id": url_para, "data": data}) + '\n')
                except Exception as e:
                    logger.error('In save2file:' + str(e))

    def run(self):
        while True:
            self.save2db(30)
            # self.save2file(200)


class Supervisor(threading.Thread):
    def __init__(self, tasks_status_db):
        super(Supervisor, self).__init__()
        self.tasks_status_db = tasks_status_db

    def run(self):
        global response_queue
        while True:
            pre_count = get_count()
            time.sleep(10)
            now_count = get_count()
            logger.info('page_count:{now_count} speed:{speed} response_queue.qsize():{size}'.format(
                now_count=now_count, speed=(now_count - pre_count) / 10, size=response_queue.qsize()))
            # count_documents replaces the deprecated Cursor.count()
            print("PROCESSED: " + str(self.tasks_status_db.count_documents({"status": PROCESSED})))
            print("PROCESSING: " + str(self.tasks_status_db.count_documents({"status": PROCESSING})))
            print("UN_PROCESSED: " + str(self.tasks_status_db.count_documents({"status": UN_PROCESSED})))


if __name__ == '__main__':
    t = TaskProducer(train_tasks_status_db)
    t.start()
    t = DataSaver3(train_tasks_status_db, train_result_filename)
    t.start()
    for i in range(2):
        t = Crawler(train_tasks_status_db, parse, construct_url)
        t.start()
    # t = IpChanger()
    # t.start()
    t = Supervisor(train_tasks_status_db)
    t.start()
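
# A minimal seeding sketch (an assumption, not part of this project's files): TaskProducer expects
# train_tasks_status_db to hold documents keyed by "from-to" station telecodes with a status field,
# for example:
#
#     train_tasks_status_db.insert_one({'_id': 'AOH-ICW', 'status': UN_PROCESSED})
#
# Once such documents exist, running this script starts one TaskProducer, one DataSaver3,
# two Crawler threads and one Supervisor that logs throughput every 10 seconds.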