# crawler_gunijan.py
import copy
import hashlib
import json
import os
import pickle
import re
import shutil
import time
import traceback
from urllib.parse import unquote

from bs4 import BeautifulSoup
from selenium.common.exceptions import (
    NoSuchElementException,
    TimeoutException,
    WebDriverException,
)
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm

import constants
from crawler_base import CrawlerBase
from sessions import TouchVPNSession
class CrawlerGunijan(CrawlerBase):
    """Crawler for http://gunijan.org.bd built on top of ``CrawlerBase``.

    Each visited page is scrolled until its height stops growing, same-site
    links are collected from it, and, for article pages, the title and body
    text are extracted into a small ``<article>`` markup string.
    """

    # Root of the crawled site; used both as the start URL and as the prefix
    # that decides whether a link belongs to the site.
    _SITE_ROOT = 'http://gunijan.org.bd'

    # Close button of the web-push subscription modal that can cover the page.
    _PUSH_MODAL_CLOSE_XPATH = "//div[@id='webPushModal']//button[@class='close']"

    def __init__(self):
        """Configure the base crawler for the site with a TouchVPN session."""
        super().__init__(self._SITE_ROOT,
                         out_dir='gunijan',
                         session_class=TouchVPNSession,
                         **constants.TouchVPNSessionDefaults)
        # URLs that parse_html never adds to the collected link set.
        self.blackList = {'http://gunijan.org.bd/search?q='}

    def load_full_page(self, session, patience=1):
        """Scroll to the bottom until the page height stops changing.

        After every scroll the method sleeps one second, tries to dismiss the
        web-push modal, and re-measures ``document.body.scrollHeight``.

        Args:
            session: Wrapper whose ``session`` attribute is the Selenium
                driver used to run the scripts.
            patience: Number of consecutive scrolls with an unchanged page
                height after which scrolling stops.
        """
        driver = session.session
        scroll_js = '''window.scrollTo(0, document.body.scrollHeight);'''
        height_js = '''var l=document.body.scrollHeight; return l;'''
        # The condition object is stateless, so it is built once and reused.
        modal_close_visible = EC.visibility_of_element_located(
            (By.XPATH, self._PUSH_MODAL_CLOSE_XPATH)
        )

        current_height = driver.execute_script(height_js)
        stable_rounds = 0
        while True:
            driver.execute_script(scroll_js)
            time.sleep(1)
            self._dismiss_push_modal(driver, modal_close_visible)

            prev_height = current_height
            current_height = driver.execute_script(height_js)
            if prev_height != current_height:
                # The page grew (or shrank): start counting again.
                stable_rounds = 0
                continue
            stable_rounds += 1
            if stable_rounds >= patience:
                break

    def _dismiss_push_modal(self, driver, modal_close_visible):
        """Click the push-notification modal's close button if it is visible.

        This is best effort: a missing, stale or unclickable button is
        ignored. Only Selenium errors are suppressed, so KeyboardInterrupt
        and programming errors still propagate.
        """
        try:
            # The condition returns the located element when it is visible
            # and a falsy value otherwise, so no second lookup is needed.
            close_button = modal_close_visible(driver)
            if close_button:
                close_button.click()
        except WebDriverException:
            pass

    def _resolve_link(self, href):
        """Return the absolute on-site URL for ``href``, or ``''`` if off-site.

        Site-relative paths (``/x``) are prefixed with the site root.
        Protocol-relative URLs (``//host/x``) are resolved against the
        ``http:`` scheme first, so they are only kept when they really point
        at the site instead of being glued onto the site root.
        """
        if href.startswith('//'):
            href = 'http:' + href
        if href.startswith('/'):
            return self._SITE_ROOT + href
        if href.startswith(self._SITE_ROOT):
            return href
        return ''

    @staticmethod
    def _element_text(element):
        """Return the stripped text of a parsed element, or ``''`` if absent."""
        return element.get_text().strip() if element else ''

    def parse_html(self, session):
        """Load the current page fully and extract its links and article text.

        Args:
            session: Wrapper whose ``session`` attribute is the Selenium
                driver positioned on the page to parse.

        Returns:
            A ``(links, content, output_fname)`` tuple:

            * ``links`` - set of on-site URLs found on the page (plus the
              page's own URL); only the site root when the browser is not
              on the site.
            * ``content`` - ``<article>`` markup with the title and body
              text, or ``''`` when the page is not an article page or either
              part is missing.
            * ``output_fname`` - SHA-1 hex digest of the unquoted page URL,
              or ``''`` whenever ``content`` is empty.
        """
        driver = session.session
        links = set()
        content = ''
        output_fname = ''

        # A URL directly under the root splits into exactly four parts:
        # 'http:', '', 'gunijan.org.bd', '<slug>'. Such pages are treated as
        # article pages and get a shorter scrolling patience.
        start_url = driver.current_url
        article_page = (start_url.startswith(self._SITE_ROOT)
                        and len(start_url.split("/")) == 4)
        self.load_full_page(session, 1 if article_page else 2)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_url = driver.current_url

        if page_url.startswith(self._SITE_ROOT):
            links.add(page_url)
            for anchor in soup.find_all('a', href=True):
                target = self._resolve_link(anchor['href'])
                if target and target not in self.blackList:
                    links.add(target)
        else:
            # The browser is no longer on the site; only report the root.
            links.add(self._SITE_ROOT)

        if article_page:
            container = soup.find("div", {"class": "container"})
            if container:
                title_text = self._element_text(
                    container.find("div", {"class": "entry-title"}))
                content_text = self._element_text(
                    container.find("div", {"class": "entry-body"}))

                if title_text and content_text:
                    content = (
                        '\n<article>\n'
                        f'<title>{title_text}</title>\n'
                        '<text>\n'
                        f'{content_text}\n'
                        '</text>\n'
                        '</article>\n'
                    )
                    # The file name is derived from the percent-decoded URL.
                    encoded_link = unquote(page_url).encode('utf-8', errors='ignore')
                    output_fname = hashlib.sha1(encoded_link).hexdigest()

        return links, content, output_fname
if __name__ == "__main__":
    # Script entry point: build the crawler and start crawling.
    CrawlerGunijan().run()