-
Notifications
You must be signed in to change notification settings - Fork 0
/
tasks.py
67 lines (62 loc) · 2.24 KB
/
tasks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Celery tasks that scrape ria.ru archive listings and per-article statistics.
from celery import Celery
import sys
import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
# NOTE: evaluated once, when this module is imported; the value does not
# advance while the importing process keeps running.
today = datetime.today()
from db_link import Post
from time import sleep
# Celery application on which the tasks in this module are registered.
app = Celery()
# First URL path segments (site sections) for which article metrics are
# collected; posts whose URL starts with any other segment are skipped.
RUBRICS = ['economy', 'society', 'atomtec', 'teplo', 'space', 'science', 'religion', 'ecology_news', 'mediawars']
@app.task()
def parse_archive():
    """Scrape today's ria.ru archive listing and store posts not seen before.

    Fetches ``https://ria.ru/archive/<YYYYMMDD>/`` for the current date, walks
    every link inside the ``div.b-list__item`` elements and saves a ``Post``
    (title, absolute URL, image URL) for each link whose title is not stored
    yet.

    Raises:
        requests.RequestException: if the archive page cannot be fetched
            (including a timeout).
    """
    # Resolve the date on every run. A date captured at import time would keep
    # pointing at the day the process started for as long as it stays up.
    date = datetime.today().strftime('%Y%m%d')
    archive_url = 'https://ria.ru/archive/{}/'.format(date)
    # Bound the request so an unresponsive server cannot block the task
    # indefinitely.
    response = requests.get(archive_url, timeout=30)
    soup = BeautifulSoup(response.text, 'html.parser')

    for entry in soup.find_all("div", {"class": "b-list__item"}):
        for link in entry.find_all('a', href=True):
            image = link.img
            if image is None or image.get('src') is None:
                # A link without a thumbnail has no image URL to store; skip
                # it instead of letting one entry abort the whole run.
                continue

            url = 'https://ria.ru{}'.format(link['href'])
            # An href that is already absolute ends up double-prefixed by the
            # line above; strip the extra site prefix again.
            url = url.replace('https://ria.ruhttps://', 'https://')
            title = link.text

            # The title is the de-duplication key. count() asks the database
            # for a number instead of loading the matching documents.
            if Post.objects(title=title).count() == 0:
                print('save')
                # save() on a new document performs an insert.
                Post(title=title, url=url, img=image['src']).save()
@app.task()
def check_metrics():
    """Refresh scraped statistics for posts published within the last day.

    For every stored ``Post`` published less than 24 hours ago whose URL
    belongs to one of ``RUBRICS``, downloads the article page, reads the
    numbers shown in its ``b-article__info-statistic`` block and stores them
    in the post's ``metrics`` field.

    Raises:
        requests.RequestException: if an article page cannot be fetched
            (including a timeout).
    """
    metric_names = ['comments', 'views', 'likes', 'dislikes']
    # One cutoff per run, so every post is judged against the same moment.
    cutoff = datetime.today() - timedelta(days=1)

    for post in Post.objects():
        if post.published <= cutoff:
            continue

        # Undo a doubled site prefix that may have been stored with the URL.
        url = post.url.replace('https://ria.ruhttps://', 'https://')
        # 'https://ria.ru/<rubric>/...' splits so that index 3 is the first
        # path segment; a URL without any path has no such element.
        parts = url.split('/')
        if len(parts) < 4 or parts[3] not in RUBRICS:
            continue

        # Pause between downloads; skipped posts cost no waiting time.
        sleep(2)
        # Bound the request so an unresponsive server cannot stall the run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')

        values = [
            number.text
            for block in soup.find_all("div", {"class": "b-article__info-statistic"})
            for number in block.find_all('span', {'class': 'b-statistic__number'})
        ]
        if not values:
            # Nothing was scraped (error page, changed layout, ...): keep the
            # metrics already stored rather than replacing them with {}.
            continue

        # Numbers are matched to names purely by position; zip() silently
        # drops whatever has no counterpart on the other side.
        metrics = dict(zip(metric_names, values))
        # The trailing '__' in the keyword is kept exactly as in the original
        # code (MongoEngine accepts it as an escape after a field name).
        post.update(set__metrics__=metrics)
        # NOTE(review): the original indentation was lost; this is assumed to
        # be reported once per updated post - confirm.
        print('Metrics ok')
# Periodic jobs for Celery beat; a numeric schedule is an interval in seconds.
app.conf.beat_schedule = {
    # Scrape the archive listing once a minute.
    'planner': {'task': 'tasks.parse_archive', 'schedule': 60.0},
    # Refresh article metrics every 30 minutes.
    'planner1': {'task': 'tasks.check_metrics', 'schedule': 1800.0},
}