reddit_scrape.py
import requests
import time
from bs4 import BeautifulSoup
from tqdm import tqdm

url = "https://old.reddit.com/r/SuicideWatch/"
headers = {'User-Agent': 'Mozilla/5.0'}
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.text, 'html.parser')
# Earlier approach (since superseded): find every <span class="domain">,
# keep the ones whose text is "(self.SuicideWatch)", and walk up to the
# parent div for the post. Filtering on the data-domain attribute below
# selects the same self-posts directly.
# Listing rows for self-posts carry data-domain="self.SuicideWatch".
attrs = {'class': 'thing', 'data-domain': 'self.SuicideWatch'}
words = []
try:
    # Walk up to 100 listing pages, following old.reddit's "next" button.
    for _ in tqdm(range(100)):
        for post in soup.find_all('div', attrs=attrs):
            f = open("titles.txt", "a+")
            f2 = open("posts.txt", "a+")
            link = post.find('a', class_="title").get('href')
            title = post.find('a', class_="title").text
            f.write("# " + title + "\n")
            # Fetch the post's own page to pull the self-text body.
            p1 = requests.get("https://old.reddit.com" + link, headers=headers)
            s1 = BeautifulSoup(p1.text, 'html.parser')
            body = s1.find('div', attrs=attrs).find('div', class_="usertext-body")
            f2.write("# " + title + "\n")
            if body is not None:
                f2.write("@ " + body.text + "\n")
            words.extend(title.split(" "))
            f.close()
            f2.close()
        if soup.find("span", class_="next-button") is not None:
            next_button = soup.find("span", class_="next-button")
            next_page_link = next_button.find("a").attrs['href']
        else:
            # No next button on this page: re-request the previous "next" URL
            # (set on an earlier iteration) with its trailing count/after query
            # trimmed, and read the button from that page instead.
            page = requests.get(next_page_link[:-16], headers=headers)
            soup = BeautifulSoup(page.text, 'html.parser')
            next_button = soup.find("span", class_="next-button")
            next_page_link = next_button.find("a").attrs['href']
        time.sleep(2)
        page = requests.get(next_page_link, headers=headers)
        soup = BeautifulSoup(page.text, 'html.parser')
except AttributeError:
    # If any find() came back None, dump the current page for debugging.
    print(page.text)
    print(soup)
print(words)
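
# --- Companion sketch, not part of the original scraper ----------------------
# A minimal, hypothetical reader for the files this script produces:
# titles.txt holds "# <title>" lines, and posts.txt holds "# <title>" lines,
# each optionally followed by "@ <body>" (bodies may span multiple lines).
# load_posts() is an assumed helper name, and the parsing is approximate,
# since a body line could itself begin with "# " or "@ ".
def load_posts(path="posts.txt"):
    posts = []
    with open(path, encoding="utf-8") as fh:
        for line in fh:
            if line.startswith("# "):
                # Start a new record on each title line.
                posts.append({"title": line[2:].rstrip("\n"), "body": ""})
            elif line.startswith("@ ") and posts:
                posts[-1]["body"] = line[2:]
            elif posts:
                # Bodies span multiple lines; unprefixed lines continue them.
                posts[-1]["body"] += line
    return posts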