# aws-blog-parser.py
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import sys
import json
import argparse

# Global Variables
siteContent = []  # HTML of each paginated page of blog-post titles and previews
blogPosts = []    # URLs for each blog post
output = {}       # JSON document containing all posts, authors, dates published, and tags

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return
    the raw content; otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None

def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers.get('Content-Type', '').lower()
    return (resp.status_code == 200
            and content_type.find('html') > -1)

def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

# Attempt to get the blog URL to parse
try:
    parser = argparse.ArgumentParser(description='Extract contents of an AWS blog. As of July 12th, 2018.')
    parser.add_argument('-f', action='store_true', help='Output each blog post to a separate .json file.')
    parser.add_argument('blogurl', help='URL of AWS blog. Ex: http://aws.amazon.com/blogs/database/')
    args = vars(parser.parse_args())
    print(args)
    newBlog = args['blogurl']
    siteContent.append(simple_get(newBlog))
except IndexError:
    parser.print_help()
    sys.exit()
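
# Example invocation (a sketch based on the argparse setup above; the URL is the one from the help text):
#
#   python aws-blog-parser.py http://aws.amazon.com/blogs/database/
#   python aws-blog-parser.py -f http://aws.amazon.com/blogs/database/   # write each post to its own .json file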

# Starting with the second page of older posts, pull the contents of each subsequent page into an array
pageNumber = 2
siteURL = newBlog + 'page/' + str(pageNumber) + '/'
nextSiteContent = simple_get(siteURL)
while nextSiteContent is not None:
    siteContent.append(nextSiteContent)
    pageNumber += 1
    siteURL = newBlog + 'page/' + str(pageNumber) + '/'
    nextSiteContent = simple_get(siteURL)
print("Number of pages found: " + str(len(siteContent)))

# Take each page of the blog contents and parse out the URL for each separate blog post
for page in siteContent:
    html = BeautifulSoup(page, 'html.parser')
    urls = html.select('h2[class="blog-post-title"] a[href]')
    for url in urls:
        blogPosts.append(url.get('href'))
print("Number of blog posts found: " + str(len(blogPosts)))

# Using the URLs for each of the posts - contained in blogPosts[] - collect the HTML for each post page,
# then parse the contents. Return author, date, tags, and post contents as JSON.
output['posts'] = []  # declare a new array of posts within the output JSON document
for post in blogPosts:
    print("Processing post at: " + post)
    postHtml = BeautifulSoup(simple_get(post), 'html.parser')
    Authors = postHtml.select('span[property="author"]')
    Title = postHtml.select('h1[property="name headline"]')
    DatePublished = postHtml.select('time[property="datePublished"]')
    Categories = postHtml.select('span[property="articleSection"]')
    postContent = postHtml.select('section[property="articleBody"]')
    tagArray = []
    authorArray = []
    for tag in Categories:
        tagArray.append(tag.text)
    for auth in Authors:
        authorArray.append(auth.text)
    postJson = {}
    postJson["url"] = post
    postJson["title"] = Title[0].text
    postJson["authors"] = authorArray
    postJson["date"] = DatePublished[0].text
    postJson["tags"] = tagArray
    postJson["post"] = postContent[0].text
    # If the -f flag was set on the command line, write each post to its own .json file
    if args['f']:
        with open(Title[0].text.replace("/", "_") + '.json', 'w') as outputfile:
            json.dump(postJson, outputfile)
    output["posts"].append(postJson)

# If the -f flag is not set, output the contents of the blog and every post as a single JSON document to STDOUT
if not args['f']:
    print(json.dumps(output))
print("Processing Completed!")