datacamp_instructors_2.py
import scrapy
import urllib.request
from csv import writer
from os import getcwd, path, mkdir
# Import the CrawlerProcess: for running the spider
from scrapy.crawler import CrawlerProcess


def img_downloader(image_url, filename):
    # Save an instructor's image into an 'instructors_images' directory,
    # creating the directory on first use.
    current_dir = getcwd()
    instructors_dir = path.join(current_dir, 'instructors_images')
    img_name = str(filename)
    if not path.exists(instructors_dir):
        mkdir(instructors_dir)
    urllib.request.urlretrieve(image_url, path.join(instructors_dir, img_name))


class DC_instructor_spider(scrapy.Spider):
    name = "datacamp_instructor"
    start_urls = ['https://www.datacamp.com/instructors?all=true']
    output = 'instructors_datacamp.csv'

    # def start_requests(self):
    #     urls = ['https://www.datacamp.com/instructors?all=true']
    #     for url in urls:
    #         yield scrapy.Request(url=url, callback=self.parse_front)

    def parse(self, response):
        # Collect the relative links to each instructor's profile page
        # and follow each one.
        instructor_links = response.css('div.instructor-block__description a.instructor-block__link::attr(href)')
        links_to_follow = instructor_links.extract()
        for url in links_to_follow:
            yield scrapy.Request(url="https://www.datacamp.com" + url, callback=self.parse_front)

    def parse_front(self, response):
        # Scrape the name, role, bio, and image URL from an instructor's profile page.
        instructor_div = response.css('div.css-fe452h')
        name = instructor_div.css('h1::text').extract_first().strip()
        role = instructor_div.css('h2::text').extract_first().strip()
        bio = instructor_div.css('p::text').extract_first().strip()
        img_url = instructor_div.css('img::attr(src)').extract_first().strip()
        # Append one row per instructor to the CSV output file
        with open(self.output, 'a', newline="") as fhand:
            writer_csv = writer(fhand)
            writer_csv.writerow([name, role, bio])
        yield {'Name': name, 'role': role, 'bio': bio, 'img_url': img_url}
        img_downloader(img_url, name)
        # instr_dict = dict()


# Run the spider
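# A minimal sketch of the run step, assuming the script is executed directly;
# the original file ends at the comment above, so the exact call is not shown.
# CrawlerProcess was imported at the top for this purpose.
process = CrawlerProcess()
process.crawl(DC_instructor_spider)
process.start()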