#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import pandas
import time
import datetime
import re
import random
import logging
from selenium import webdriver
# Chrome options for the shared browser: headless mode, GPU disabled and a
# fixed 1200x1100 window size.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
chrome_options.add_argument('window-size=1200,1100')
# Module-level WebDriver instance used by the scraping functions in this
# script; the chromedriver binary is loaded from a hard-coded Windows path.
driver = webdriver.Chrome(chrome_options=chrome_options,executable_path='D:/chromedriver/chromedriver.exe')
# Alternative kept for reference: start Chrome without the options above.
# driver = webdriver.Chrome('D:/chromedriver/chromedriver.exe')
# Module-level DataFrame holding the scraped posts; getContent() rebinds it
# (via ``global df``) and exports it to Excel.
df = pandas.DataFrame()
# driver.maximize_window()
# Log in
def LoginWeibo(username, password):
    """Fill in and submit the Weibo login form with the shared ``driver``.

    The function opens the login page, types *username* and *password* into
    the form, ticks the "save state" checkbox and clicks the submit link,
    pausing between steps with fixed sleeps.

    Any exception raised along the way is logged with its traceback and then
    swallowed, so the caller is not told whether the login succeeded.

    Args:
        username: Account name typed into the ``loginname`` input.
        password: Password typed into the password input.
    """
    login_page = 'http://www.weibo.com/login.php'
    name_field = '//input[@id="loginname"]'
    password_field = '//*[@id="pl_login_form"]/div/div[3]/div[2]/div/input'
    save_state_box = '//*[@id="login_form_savestate"]'
    submit_link = '//*[@id="pl_login_form"]/div/div[3]/div[6]/a'

    try:
        locate = driver.find_element_by_xpath

        driver.get(login_page)
        time.sleep(5)

        # The field is looked up twice on purpose: once to clear it and once
        # to type into it, exactly as two separate DOM queries.
        locate(name_field).clear()
        locate(name_field).send_keys(username)
        time.sleep(3)

        locate(password_field).send_keys(password)
        locate(save_state_box).click()
        time.sleep(1)

        locate(submit_link).click()
    except Exception:
        logger.error('Something wrong with', exc_info=True)
# Search for a keyword; the time range is split into windows that are
# searched one after another.
def _weibo_hour_stamp(moment):
    """Format *moment* as ``YYYY-MM-DD-H`` (hour not zero-padded).

    NOTE(review): this hour-granular form is what the ``timescope`` query
    parameter is expected to take; confirm against a URL generated by
    Weibo's own advanced-search dialog.
    """
    return '%s-%d' % (moment.strftime('%Y-%m-%d'), moment.hour)


def GetSearchContent(key, start_date=None, end_date=None, delta_date=None):
    """Search Weibo for *key* and scrape the results window by window.

    The keyword is submitted through the search page once; the resulting URL
    (everything before the first ``&``) is then reused as the base for one
    custom-timescope query per window, each of which is handed to
    ``handlePage()``.

    Fixes relative to the previous version:

    * the query string contained ``×cope`` (an HTML-entity mangling of
      ``&timescope``), so the time filter was never sent;
    * the window advances in one-hour steps but the stamps were formatted
      with day precision only, so consecutive windows produced the same
      start/end date; the hour is now part of each stamp.

    Args:
        key: Search keyword typed into the search box.
        start_date: ``datetime`` at which the first window starts.
            Defaults to 2018-10-18 00:00.
        end_date: ``datetime`` at which the last window must end (inclusive).
            Defaults to 2018-10-19 00:00.
        delta_date: ``timedelta`` giving the window length. Defaults to one
            hour.

    Raises:
        ValueError: If *delta_date* is not positive (the loop would never
            terminate).
    """
    if start_date is None:
        start_date = datetime.datetime(2018, 10, 18, 0)
    if end_date is None:
        end_date = datetime.datetime(2018, 10, 19, 0)
    if delta_date is None:
        delta_date = datetime.timedelta(hours=1)
    if delta_date <= datetime.timedelta(0):
        raise ValueError('delta_date must be a positive timedelta')

    driver.get("http://s.weibo.com/")
    logger.info('搜索热点主题:%s', key)
    driver.find_element_by_xpath("//input").send_keys(key)
    time.sleep(3)
    driver.find_element_by_xpath('//button').click()

    # Drop any extra query parameters so our own can be appended cleanly.
    current_url = driver.current_url.split('&')[0]

    start_stamp = start_date
    end_stamp = start_date + delta_date
    while end_stamp <= end_date:
        url = (
            current_url
            + '&typeall=1&suball=1&timescope=custom:'
            + _weibo_hour_stamp(start_stamp)
            + ':'
            + _weibo_hour_stamp(end_stamp)
            + '&Refer=g'
        )
        # Random pause between windows to avoid a fixed request rhythm.
        time.sleep(random.randint(5, 10))
        driver.get(url)
        handlePage()
        start_stamp = end_stamp
        end_stamp = start_stamp + delta_date
# Process the result pages: check for content and scrape it when present.
def handlePage():
    """Scrape the current result page and every following page.

    Each iteration sleeps for a random 5-10 seconds, then:

    * stops (logging ``no Content``) if ``checkContent()`` is false;
    * otherwise logs the page number and calls ``getContent()``;
    * stops (logging ``no Next``) if ``checkNext()`` is false;
    * otherwise clicks the "next" link and repeats.
    """
    next_link = '//div[@class="m-page"]/div/a[@class="next"]'
    page_number = 1

    while True:
        time.sleep(random.randint(5, 10))

        if not checkContent():
            logger.info("no Content")
            return

        logger.info('页数:%s' % page_number)
        getContent()
        page_number += 1

        if not checkNext():
            logger.info("no Next")
            return

        driver.find_element_by_xpath(next_link).click()
# Check whether the current page has any content.
def checkContent():
    """Return True unless the current page contains the "no result" card.

    The page is considered empty when a ``div`` with the exact class string
    ``card card-no-result s-pt20b40`` is present (presumably Weibo's
    "no results" placeholder, judging by the class name).

    ``find_elements_by_xpath`` is used so that absence is reported as an
    empty list rather than an exception. Unlike the previous bare
    ``except:``, a genuine driver failure now propagates instead of being
    silently reported as "page has content".

    Returns:
        bool: False if the no-result card is present, True otherwise.
    """
    no_result_cards = driver.find_elements_by_xpath(
        "//div[@class='card card-no-result s-pt20b40']"
    )
    return not no_result_cards
# Check whether there is a next page.
def checkNext():
    """Return True if the pager on the current page has a "next" link.

    ``find_elements_by_xpath`` is used so that absence is reported as an
    empty list rather than an exception. Unlike the previous bare
    ``except:``, a genuine driver failure now propagates instead of being
    silently reported as "no next page".

    Returns:
        bool: True if at least one ``a.next`` link exists inside the
        ``m-page`` pager, False otherwise.
    """
    next_links = driver.find_elements_by_xpath(
        '//div[@class="m-page"]/div/a[@class="next"]'
    )
    return bool(next_links)
# Normalize a timestamp label.
def get_datetime(s):
    """Convert a Weibo timestamp label to ``YYYY-MM-DD HH:MM``.

    Three label shapes are recognised, based on the integers found in *s*:

    * contains ``今天`` ("today"): two numbers, hour and minute, combined
      with today's date;
    * contains ``年`` ("year"): five numbers, year/month/day/hour/minute;
    * anything else: four numbers, month/day/hour/minute, combined with the
      current year.

    Any label that does not fit (wrong number of integers, impossible date,
    non-string input) is returned unchanged. Only conversion errors are
    caught; the previous bare ``except:`` also swallowed
    ``KeyboardInterrupt`` and ``SystemExit``.

    Args:
        s: The raw label text.

    Returns:
        The formatted timestamp string, or *s* itself when it cannot be
        parsed.
    """
    try:
        numbers = [int(digits) for digits in re.findall(r'\d+', s)]
        today = datetime.datetime.today()
        if '今天' in s:
            hour, minute = numbers
            year, month, day = today.year, today.month, today.day
        elif '年' in s:
            year, month, day, hour, minute = numbers
        else:
            month, day, hour, minute = numbers
            year = today.year
        moment = datetime.datetime(year, month, day, hour, minute)
    except (TypeError, ValueError, OverflowError):
        # TypeError: s is not a string; ValueError: wrong number of fields
        # or an impossible date; OverflowError: absurdly large integers.
        return s
    return moment.strftime('%Y-%m-%d %H:%M')
# Scrape the posts on the current page.
def _node_text(node, xpath):
    """Return the text of the first element under *node* matching *xpath*, or ''."""
    try:
        return node.find_element_by_xpath(xpath).text
    except Exception:
        return ''


def _node_attribute(node, xpath, attribute):
    """Return *attribute* of the first element matching *xpath*, or '' on failure."""
    try:
        return node.find_element_by_xpath(xpath).get_attribute(attribute)
    except Exception:
        return ''


def _node_count(node, xpath, label=''):
    """Parse a counter element as an int; 0 if missing, blank or non-numeric."""
    try:
        raw = node.find_element_by_xpath(xpath).text.replace(label, '').strip()
        return int(raw) if raw else 0
    except Exception:
        return 0


def _node_content(node):
    """Return the post text, with the forwarded text appended when present."""
    content_xpath = './/p[@class="txt"][@node-type="feed_list_content"]'
    try:
        paragraphs = node.find_elements_by_xpath(content_xpath)
        if not paragraphs:
            return ''
        content = paragraphs[0].text
    except Exception:
        return ''
    # More than one content paragraph is treated as a repost: the forwarded
    # block's text is appended after a '转发:' ("forwarded:") marker.
    if len(paragraphs) > 1:
        forwarded = _node_text(node, './/div[@node-type="feed_list_forwardContent"]')
        if forwarded:
            content = content + '\n转发:' + forwarded
    return content


def getContent(max_retries=None):
    """Scrape every post card on the current page and export the running total.

    Each card becomes one row with the columns 博主昵称 (author nickname),
    博主主页 (author page URL), 微博内容 (post text), 发布时间 (publish
    time, normalized by ``get_datetime``), 微博地址 (post URL), 微博来源
    (post source), 转发 (reposts), 评论 (comments) and 赞 (likes). Fields
    that cannot be read fall back to ``''`` (text) or ``0`` (counters).

    The rows are appended to the module-level ``df``, which is then written
    in full to the Excel file on every call.

    Changes relative to the previous version:

    * an empty page is retried in a loop; previously the function recursed
      and then fell through with the stale empty node list, exporting and
      logging a second time (and recursing without bound);
    * rows are added with ``pandas.concat`` because ``DataFrame.append`` was
      removed in pandas 2.0;
    * a repost whose forwarded block cannot be read keeps its own text
      instead of being blanked;
    * field lookups catch ``Exception`` rather than using a bare ``except:``.

    Args:
        max_retries: Maximum number of reloads when the page shows no post
            cards. ``None`` (the default) keeps retrying until cards appear,
            as before.
    """
    global df

    card_xpath = '//div[@class="card-wrap"][@action-type="feed_list_item"][@mid]'
    from_link = './/div[@class="content"]/p[@class="from"]/a[1]'

    nodes = driver.find_elements_by_xpath(card_xpath)
    attempts = 0
    while not nodes and (max_retries is None or attempts < max_retries):
        # Long random pause before reloading the same URL.
        time.sleep(random.randint(20, 30))
        driver.get(driver.current_url)
        nodes = driver.find_elements_by_xpath(card_xpath)
        attempts += 1

    logger.info('微博数量:%s', len(nodes))

    results = []
    for node in nodes:
        results.append({
            '博主昵称': _node_attribute(node, './/a[@class="name"]', 'nick-name'),
            '博主主页': _node_attribute(node, './/a[@class="name"]', 'href'),
            '微博内容': _node_content(node),
            '发布时间': get_datetime(_node_text(node, from_link)),
            '微博地址': _node_attribute(node, from_link, 'href'),
            '微博来源': _node_text(node, './/div[@class="content"]/p[@class="from"]/a[2]'),
            '转发': _node_count(node, './/div[@class="card-act"]/ul/li[2]', '转发'),
            '评论': _node_count(node, './/div[@class="card-act"]/ul/li[3]', '评论'),
            '赞': _node_count(node, './/div[@class="card-act"]/ul/li[4]/a/em'),
        })

    if results:
        new_rows = pandas.DataFrame(results)
        # Skip the concat for the very first page so no empty frame is mixed in.
        df = new_rows if df.empty else pandas.concat([df, new_rows], ignore_index=True)

    df.to_excel('C:/Users/Administrator/Desktop/results.xlsx', index=False)
    logger.info('已导出微博条数:%s', len(df))
# The logger lives at module level (not under the __main__ guard) so that the
# functions in this file can log when the module is imported rather than run
# as a script; handlers are still attached only when it runs as a script.
logger = logging.getLogger(__name__)


def _configure_logging():
    """Send INFO-level records to ``export_record.log`` and to the console."""
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    for handler in (logging.FileHandler('export_record.log'), logging.StreamHandler()):
        handler.setLevel(logging.INFO)
        handler.setFormatter(formatter)
        logger.addHandler(handler)


def main():
    """Log in, run the keyword search and always shut the browser down.

    ``driver.quit()`` runs in a ``finally`` block so the Chrome process is
    not left running when login or scraping raises. An unexpected error is
    logged with its traceback and re-raised.
    """
    _configure_logging()
    logger.info('*'*30+'START'+'*'*30)
    username = '******'  # fill in the account name
    password = '******'  # fill in the password
    key = 'python'  # fill in the search keyword
    try:
        LoginWeibo(username, password)
        GetSearchContent(key)
        time.sleep(10)
    except Exception:
        logger.exception('Scraping aborted by an unexpected error')
        raise
    finally:
        driver.quit()
    logger.info('*'*30+'E N D'+'*'*30)


if __name__ == '__main__':
    main()