-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathCrawlr.py
358 lines (308 loc) · 15.1 KB
/
Crawlr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
import datetime
import webbrowser
from time import sleep
from requests import Session
from robobrowser import RoboBrowser
from lxml import html
from urllib import parse
import pymysql.cursors
import timestring
import json
import requests
from requests.auth import HTTPBasicAuth
import logging
import os.path
import CONFIG
# Alternative request settings kept from earlier experiments (unused):
# headers = {'user-agent': 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'}
# proxies = {'https': 'https://88.209.225.150:53281', 'http': 'http://88.209.225.150:53281'}

# Debug dump target used by pexit() when something goes wrong.
file = 'del.html'
session = Session()
useragent = 'Mozilla/5.0 (Linux; Android 4.4; Nexus 5 Build/_BuildID_) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/30.0.0.0 Mobile Safari/537.36'
# Shared mobile-UA browser session every scraping function drives.
browser = RoboBrowser(user_agent=useragent, session=session, parser='html.parser')
# _____FUNCTIONS_______
def convert_si_to_number(x):
    """Convert a Facebook-style count string to an int.

    Accepts plain digit strings ('150') and K/M/B-suffixed values
    ('2.1K', '3M', '1B'), case-insensitively for every suffix (the
    original only recognised 'K'/'k', 'M' and 'B').  A bare suffix with
    no number ('K') or any other unparseable input ends the script via
    pexit() instead of silently returning 0.
    """
    multipliers = {'k': 1000, 'm': 1000000, 'b': 1000000000}
    text = x.strip()
    suffix = text[-1:].lower()
    try:
        if len(text) > 1 and suffix in multipliers:
            return int(float(text[:-1]) * multipliers[suffix])
        return int(text)  # no SI suffix: plain integer count
    except ValueError:  # catch failure
        pexit('Error: convert_si_to_number(' + str(x) + ')')
def login():
    """Log in to Facebook using the credentials from CONFIG.

    The Facebook profile's language needs to be EN-US, because later
    parsing matches English page text.  If the login form is served
    again after submission, the login failed and pexit() ends the run.
    """
    print('Logging in to facebook...')
    browser.open("https://facebook.com")
    form = browser.get_form(id='login_form')
    form['email'].value = CONFIG.FB_EMAIL
    form['pass'].value = CONFIG.FB_PASSWORD
    browser.submit_form(form)
    page_text = browser.parsed.encode().decode("utf-8")
    if '<form action="https://m.facebook.com/login' in page_text:
        pexit('Login failed')
def pexit(printit=''):
    """Abort the script after dumping the current page for debugging.

    Prints *printit*, writes the raw HTML of the browser's current page
    to del.html, opens it in the local web browser, and terminates.
    """
    print(printit)
    with open(file, "w", encoding="utf-8") as text_file:
        print(browser.parsed.encode(), file=text_file)
    print('\x1b[7;31;40m' + 'Error, writing del.html...' + '\x1b[0m')
    webbrowser.open_new_tab(file)
    # Was exit(0): an error bailout should not report success to the shell.
    exit(1)
inserted_count = 0  # events upserted during this run; incremented by getevent()
def getevent(eventid):
    """Scrape one Facebook event page and upsert it into the `events` table.

    Loads the mobile event page, extracts title, description, date range,
    place, attendance counts and cover photo, geocodes the location via
    CONFIG.GEOCACHE_HOST when configured, then INSERTs the row (updating
    on duplicate key).  Relies on the module-level `browser`, `connection`
    and `pageid` globals.  Any failure is logged and ends the script
    through pexit().
    """
    global inserted_count
    try:
        print('\x1b[6;33;40m' + 'Getting event: ' + str(eventid) + '\x1b[0m', end="")
        eventurl = "https://mobile.facebook.com/events/" + eventid
        browser.open(eventurl)
        tree = html.fromstring(browser.parsed.encode())
        event_description = get_description(tree)
        event_ago_location = get_event_ago(tree)
        event_going_number = get_going(tree)
        event_title = get_title(tree)
        event_date_place = get_date_place(tree)
        event_photo = get_photo(tree)
        # Parse the date range, e.g. "Friday, November 16, 2018 at 8 PM – 11:55 PM".
        if " dates left" in event_date_place[0]:
            # Recurring events list several dates; we cannot pick one reliably.
            print(' - \033[1;31;0mError while getting date:\033[1;0;0m')
            print(event_date_place)
            dateto = None
            datefrom = None
        else:
            splitted = event_date_place[0].split(' – ', 1)
            if len(splitted) < 2:
                splitted = event_date_place[0].split(' - ', 1)  # – and - are not equal!!!
            if len(splitted) < 2:
                print(' - \033[1;31;0mError while splitting date: \033[1;0;0m' + event_date_place[0])
                dateto = None
                datefrom = None
            else:
                datefrom = timestring.Date(splitted[0]).date
                # The end time has no date part; borrow it from the start by
                # dropping the start's trailing time token before appending.
                dateto = timestring.Date(splitted[0][:-4] + splitted[1]).date
        # Join the description fragments into one HTML string.
        lines = ''.join(part + '<br>' for part in event_description)
        if len(event_ago_location) == 2:
            event_ago = event_ago_location[0]
            event_location = event_ago_location[1]
        elif len(event_ago_location) == 1:
            print(' - event_ago is NULL', end='')
            event_ago = None
            event_location = event_ago_location[0]
        else:
            event_ago = None
            event_location = None
            print(' - get_event_ago() --> event_ago_location lenght is ' + str(len(event_ago_location)), end='')
        if len(event_date_place) > 2:
            # Unexpected summary layout: dump the page for inspection and stop.
            with open(file, "w", encoding="utf-8") as text_file:
                print(browser.parsed.encode(), file=text_file)
            print('Opening browser...')
            webbrowser.open_new_tab(file)
            pexit('event_date_place lenght is not 1: ' + str(event_date_place))
        # Build the record.
        now = datetime.datetime.now()
        event_date = event_date_place[0]
        event_place = event_date_place[1]
        event_going = event_going_number[0]
        event_interested = event_going_number[1]
        lat = '0'  # fallback coordinates when geocoding is unavailable
        lon = '0'
        if CONFIG.GEOCACHE_HOST != '':
            if event_location is not None:  # geocode the street address first
                logging.info('\n' + CONFIG.GEOCACHE_HOST + '?address=' + event_location + '\n')
                response = requests.get(CONFIG.GEOCACHE_HOST + '?address=' + event_location,
                                        auth=HTTPBasicAuth(CONFIG.GEO_USER, CONFIG.GEO_PASS))
                if response.status_code == 200:
                    jsondata = json.loads(response.content.decode('utf-8'))
                    lon = jsondata['lon']
                    lat = jsondata['lat']
                else:
                    # NOTE: logging.warning() has no end= keyword; the original
                    # passed one and raised TypeError on this path.
                    logging.warning(' - gps coord response #1 status == ' + str(response.status_code) + ' - ')
            if event_location is None or lat == '0.000000' or lat == 'null':
                # Address geocoding failed; retry with the venue name instead.
                event_ago = event_location
                event_location = None
                response = requests.get(CONFIG.GEOCACHE_HOST + '?address=' + event_place + ', Magyarország',
                                        auth=HTTPBasicAuth(CONFIG.GEO_USER, CONFIG.GEO_PASS))
                if response.status_code == 200:
                    jsondata = json.loads(response.content.decode('utf-8'))
                    lon = jsondata['lon']
                    lat = jsondata['lat']
                else:
                    logging.warning(' - gps coord is 0: ' + event_place)
                    logging.warning(' - gps coord response #2 status == ' + str(response.status_code) + ' - ')
        else:
            logging.info('GEOCACHE_HOST is not defined in CONFIG.py, skipping geocoding')
        with connection.cursor() as cursor:
            sql = "INSERT INTO events (`id`, `page`, `title`, `description`, `date`, `datefrom`, `dateto`, `place`, `ago`, `location`, `going`, `intrested`, `photo`, `lat`, `lon`, `lastupdate`) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) ON DUPLICATE KEY UPDATE `title`=%s,`page`=%s,`description`=%s,`date`=%s,`datefrom`=%s,`dateto`=%s,`place`=%s,`ago`=%s,`location`=%s,`going`=%s,`intrested`=%s,`photo`=%s,`lat`=%s,`lon`=%s,`lastupdate`=%s;"
            cursor.execute(sql, (eventid, pageid, event_title, lines, event_date, datefrom, dateto, event_place, event_ago, event_location, event_going, event_interested, event_photo, lat, lon, now, event_title, pageid, lines, event_date, datefrom, dateto, event_place, event_ago, event_location, event_going, event_interested, event_photo, lat, lon, now))
        connection.commit()
        inserted_count += 1  # was "+= 0", which never counted the upsert
        print('')
    except Exception:  # catch failure: log it, dump the page, and stop
        print('\nGetevent error')
        logging.exception("message")
        pexit()
def get_date_place(tree):
    """Return (date_text, place_text) from the event summary table.

    Index [0] is the date line (e.g. "Friday, November 16, 2018 at
    8 PM – 11:55 PM"), [1] the venue name (e.g. "Expresszó").  When the
    place is missing, the literal string 'NULL' stands in for it.
    Terminates via pexit() when nothing is found at all.
    """
    found = tree.xpath('//div[@id="event_summary"]/div/div/table/tbody/tr/td[2]/dt/div/text()')
    if not found:
        pexit('get_date() error. len(event_date_place) == 0')
    if len(found) == 1:
        print('event_place is NULL')
        return (found[0], 'NULL')
    return found
def get_event_ago(tree):
    """Return [ago_text, location_text] from the event summary <dd> cells.

    [0] is how long ago the event was posted (e.g. "3 days ago"), [1]
    the street address.  Three fragments mean the "ago" text was split
    in two by the markup; merge the first pair.  May return fewer than
    two items when data is missing — callers handle that.
    """
    fragments = tree.xpath('//div[@id="event_summary"]/div/div/table/tbody/tr/td[2]/dd/div/text()')
    if len(fragments) == 3:
        result = [fragments[0] + fragments[1], fragments[2]]
    else:
        result = fragments
    if not result:
        print(' - get_event_ago() == 0')
    return result
def get_going(tree):
    """Return [going, interested] attendee counts for the event.

    Scrapes the attendance unit (e.g. "234 going · 2.1K interested") and
    converts SI-suffixed counts to ints via convert_si_to_number().
    Returns the string pair ['0', '0'] when the unit is absent (matching
    the original behaviour), and terminates via pexit() on layouts it
    cannot parse.  The original shadowed the builtin `str` and used
    bitwise `&` on booleans; both fixed here.
    """
    texts = tree.xpath('//div[@id="unit_id_703958566405594"]/div/a/div/text()')
    if len(texts) == 0:
        texts = tree.xpath('//div[@id="unit_id_703958566405594"]/div/div/div[2]/a/text()')
    if len(texts) == 0:
        print(' - get_going() --> len(str) == 0', end='')
        event_going_number = ['0', '0']
    else:
        try:
            # Pick the first fragment that is real data, not the "Details" link.
            if texts[0] != 'Details' and texts[0] != '':
                splitted = texts[0].split(' ')
            elif texts[1] != 'Details' and texts[1] != '':
                splitted = texts[1].split(' ')
            else:
                splitted = texts[2].split(' ')
            if len(splitted) == 1:
                # Single token: only one of the two counters is present.
                if len(texts) == 1:
                    going_str = '0'
                    interest_str = texts[0]
                else:
                    going_str = texts[0]
                    interest_str = texts[1]
            elif len(splitted) < 3:
                print('str: ')
                print(texts)
                print(len(splitted))
                pexit('get_goint() error, len(splitted) < 3, == up')
            else:
                # "<going> going · <interested> interested" layout.
                going_str = splitted[0]
                interest_str = splitted[3]
        except ValueError:  # catch failure
            print('str: ')
            print(texts)
            pexit('get_goint() error, printed below')
        if isinstance(going_str, int) and isinstance(interest_str, int):
            event_going_number = [going_str, interest_str]
        else:
            event_going_number = [convert_si_to_number(going_str), convert_si_to_number(interest_str)]
    return event_going_number
def get_description(tree):
    """Return the event description as a list of text fragments, or ''
    when the description unit is absent from the page."""
    fragments = tree.xpath('//div[@id="unit_id_886302548152152"]/div[2]/text()')
    return fragments if fragments else ''
def get_title(tree):
    """Return the event title — the first <h3> text on the page.

    Terminates via pexit() when no <h3> is present.
    """
    titles = tree.xpath('//h3/text()')
    if not titles:
        print(titles)
        pexit(' - get_title() error. len(event_title) == 0')
    return titles[0]
def get_photo(tree):
    """Return the event cover image URL.

    Falls back to the video preview image for video-cover events, and to
    '' when neither image exists.
    """
    sources = tree.xpath('//div[@id="event_header"]/a/img/@src')
    if not sources:
        # Video-cover events expose only a preview frame.
        sources = tree.xpath('//a[@aria-label="Watch video"]/div/img/@src')
    if not sources:
        print(' - get_photo() error. len(src) == 0', end='')
        return ''
    return sources[0]
def getpage(page):
    """Return the list of event ids listed on a Facebook page's events tab.

    Loads the mobile events view, follows the "more" pagination links
    until none remain, and stamps the page's `lastindex` column with the
    crawl time.  Uses the module-level `browser` and `connection`
    globals; terminates via pexit() on failure.
    """
    event_link_xpath = '//div[@id="root"]/div/div/div[2]/div/table/tbody/tr/td/div/div/span[3]/div/a[1]/@href'
    more_xpath = '//div[@id="m_more_friends_who_like_this"]/a/span/text()'
    try:
        print('\x1b[6;32;40m' + 'Getting page: ' + page + '\x1b[0m')
        browser.open("https://mobile.facebook.com/" + page + "?v=events")
        tree = html.fromstring(browser.parsed.encode())
        # The event id is the last path segment, minus any query string.
        eventids = [os.path.split(href)[1].split('?')[0] for href in tree.xpath(event_link_xpath)]
        while tree.xpath(more_xpath):  # follow pagination until exhausted
            nexturl = tree.xpath('//div[@id="m_more_friends_who_like_this"]/a/@href')[0]
            browser.open('https://mobile.facebook.com' + nexturl)
            tree = html.fromstring(browser.parsed.encode())
            eventids.extend(os.path.split(href)[1].split('?')[0] for href in tree.xpath(event_link_xpath))
        # Computed before the inner try so the error message below can
        # always reference it (the original could hit an unbound `now`).
        now = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        try:
            mycursor = connection.cursor()
            mycursor.execute("UPDATE pages SET lastindex = %s WHERE page = %s", (now, page))
            connection.commit()
        except Exception:  # was a bare except:; best-effort stamp, keep going
            print('Lastindex update error' + str(now))
        return eventids
    except Exception:  # was ValueError only, which let most failures escape
        logging.exception("message")
        pexit('Getpage error')
def listpages():
    """Return page names whose events were last indexed over 5 hours ago
    (or never).  Uses the module-level DictCursor `connection`."""
    cursor = connection.cursor()
    cutoff = datetime.datetime.now() + datetime.timedelta(hours=-5)
    # pymysql accepts a single scalar parameter in place of a tuple.
    cursor.execute("SELECT page FROM pages WHERE lastindex < %s OR lastindex IS NULL", cutoff)
    return [row['page'] for row in cursor]
# _______START SCRIPT________
login()
connection = pymysql.connect(host=CONFIG.MYSQL_HOST,
                             user=CONFIG.MYSQL_USER,
                             password=CONFIG.MYSQL_PASS,
                             db=CONFIG.MYSQL_DB,
                             cursorclass=pymysql.cursors.DictCursor)
# Events whose pages the parser cannot handle; skipped outright.
BANNED_EVENTS = ('2097615280296927', '980689918806985', '773389126335451')
# Was assigned to the name `listpages`, shadowing the function itself.
pages_to_crawl = listpages()
for pageid in pages_to_crawl:  # pageid is read as a global by getevent()
    pageevents = getpage(pageid)
    for eventid in pageevents:
        if eventid not in BANNED_EVENTS:
            getevent(eventid)
        else:
            print('Banned event: ' + eventid)
# Purge events that ended more than a day ago.
nowminday = datetime.datetime.now() + datetime.timedelta(days=-1)
with connection.cursor() as cursor:
    cursor.execute("DELETE from events WHERE datefrom < %s AND dateto < %s", (nowminday, nowminday))
    result = cursor.rowcount
    print(str(result) + " old row deleted")
connection.commit()
connection.close()
now = datetime.datetime.now()
if inserted_count > 0:
    # Was `inserted_count + ' new row inserted'`: int + str raises TypeError.
    print(str(inserted_count) + ' new row inserted')
else:
    print('Pages are already updated less than an hour ago, no new events queried')
print('Script end at ' + str(now.hour) + ':' + str(now.minute))