crawling_post.py
import requests
from bs4 import BeautifulSoup


def deco_parsing(function):
    # Decorator: fetch the last URL in `urls`, parse the HTML,
    # and pass the resulting soup (plus the url list) to the wrapped function.
    def parsing(urls):
        data = requests.get(urls[-1])
        html = data.text
        soup = BeautifulSoup(html, 'html.parser')
        return function(soup, urls)
    return parsing


@deco_parsing
def extract_post_title(soup, urls):
    today_contents = ('These are the posts I studied and published on this date.\n'
                      ' You can find more details <a href="https://ffoorreeuunn.tistory.com/">🔗here</a>.\n\n')
    today_posts = soup.select(".item_category")
    url_prefix = "https://ffoorreeuunn.tistory.com"
    # Posts are always sorted newest first, so break out of the loop
    # as soon as a post no longer matches the condition.
    for today_post in today_posts:
        post_date = today_post.select(".date")[0].text
        # A time such as "12:34" (rather than a full date) marks a post published today.
        if ":" in post_date:
            post_title = today_post.select("strong")[0].text
            # The thumbnail URL sits inside an inline style attribute,
            # e.g. background-image:url('...'); slice off the wrapper.
            post_image = today_post.select(".item-thumbnail")[0]['style']
            post_image = post_image[22:-2]
            print(post_image)
            url_suffix = today_post.select("a")[0].attrs['href']
            url = url_prefix + url_suffix
            urls.append(url)
            tags = extract_post_tag(urls)
            today_contents += (f"<br><br><br><br><br><a href={url}><h2>👏{post_title}</h2></a>"
                               f"<br><br>![image]({post_image})<br><br>")
            for tag in tags:
                today_contents += f"#{tag} "
        else:
            break
    return today_contents


@deco_parsing
def extract_post_tag(soup, urls):
    # Tag names inside the .tag_content element are separated by ",\r\n".
    tags = soup.select(".tag_content")[0].text.split(',\r\n')
    return tags
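

# A minimal usage sketch, assuming the blog's front page
# (https://ffoorreeuunn.tistory.com/) serves as the seed list page containing
# ".item_category" entries; the actual entry point used by the crawler may differ.
if __name__ == "__main__":
    urls = ["https://ffoorreeuunn.tistory.com/"]  # assumed seed list-page URL
    today_contents = extract_post_title(urls)
    print(today_contents)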