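"""Run the Wikipedia episode table spider from the command line.

Example invocation (values are illustrative and mirror the argument help below):

    python run_wiki_spider.py \
        --start_url https://en.wikipedia.org/wiki/Star_Trek \
        --url_substring Star_Trek \
        --title_keywords star trek \
        --output_path wiki_episode_summaries.json
"""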
import argparse

from scrapy.crawler import CrawlerProcess

from spiders.wiki_episode_table_spider import WikiEpisodeTableSpider


def run_wiki_spider(args):
    """Define and start process for Wikipedia scraping."""
    # overwrite output
    with open(args.output_path, 'w') as f:
        pass
    # run spider
    process = CrawlerProcess(settings={
        'FEED_FORMAT': 'json',
        'FEED_URI': args.output_path,
        'ROBOTSTXT_OBEY': True,
        'DEPTH_LIMIT': 2
    })
    process.crawl(
        WikiEpisodeTableSpider, start_url=args.start_url, allow=args.url_substring, title_keywords=args.title_keywords
    )
    process.start()


def get_arguments():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(
        description='Wikipedia episode summary spider.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument('-s', '--start_url', type=str, required=True,
                        help='Start URL for the spider. '
                             'Should be: '
                             'https://en.wikipedia.org/wiki/<Show_Title_With_Underscores_And_Capitalized_Words>. '
                             'Example: https://en.wikipedia.org/wiki/Star_Trek')
    parser.add_argument('-u', '--url_substring', type=str, required=True,
                        help='Wikipedia URLs must include this substring, otherwise the spider will not enter the URL. '
                             'Ideally, it should be something like: '
                             '<Show_Title_With_Underscores_And_Capitalized_Words>. Example: "Star_Trek"')
    parser.add_argument('-t', '--title_keywords', nargs='*', required=True,
                        help='The title of the Wikipedia page must include these keywords, '
                             'otherwise the spider will not extract anything from the page. '
                             'Good practice: use the lowercase version of the words from the title of the show. '
                             'Example: star trek')
    parser.add_argument('-o', '--output_path', type=str, required=False, default='wiki_episode_summaries.json',
                        help='Path to the output JSON file. If the file already exists, it will be overwritten.')
    args = parser.parse_args()
    return args


if __name__ == '__main__':
    args = get_arguments()
    run_wiki_spider(args)