-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathscraper.rb
104 lines (90 loc) · 2.88 KB
/
scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
require 'rubygems'
require 'scraperwiki'
require 'httparty'
require 'open-uri'
require 'json'
require 'i18n'
# Scrapable classes
# Placeholder REST-style hooks meant to be mixed into storage classes.
# Including classes override whichever hooks they actually need.
module RestfulApiMethods
  # Returns +info+ exactly as given (no formatting is applied here).
  def format(info)
    info
  end

  # Intentionally empty hook; returns nil.
  def put(record); end

  # Intentionally empty hook; returns nil.
  def post(record); end
end
# Persists scraped person records in the ScraperWiki SQLite datastore,
# writing each 'uid' at most once.
class PeopleStorage
  include RestfulApiMethods

  # Stores +record+ by delegating to #post.
  def save(record)
    post record
  end

  # Inserts +record+ unless a row with the same 'uid' is already stored.
  #
  # record - Hash of column name => value; 'uid' identifies the row. An Array
  #          under 'organizations' is written as its JSON text. The caller's
  #          Hash is left unmodified.
  #
  # Prints one progress line to stdout and returns nil.
  def post(record)
    uid = record['uid']
    unless unsaved?(uid)
      puts "Skipping already saved record #{uid}"
      return
    end

    ScraperWiki.save_sqlite(['uid'], with_serialized_organizations(record))
    puts "Adds new record #{uid}"
  end

  private

  # True when no stored row carries +uid+. The uid is passed as a bound
  # parameter instead of being spliced into the SQL text, so quotes or other
  # SQL syntax inside it cannot alter the query.
  def unsaved?(uid)
    ScraperWiki.select('* from data where `uid`=?', [uid.to_s]).empty?
  rescue StandardError
    # Best effort: any lookup failure is treated as "not saved yet".
    # NOTE(review): presumably this covers the first run, before the `data`
    # table exists - confirm and narrow the rescued class if possible.
    true
  end

  # Returns +record+ itself, or a copy whose 'organizations' Array has been
  # replaced by its JSON dump (an Array cannot be stored in a single column).
  def with_serialized_organizations(record)
    organizations = record['organizations']
    return record unless organizations.is_a?(Array)

    record.merge('organizations' => JSON.dump(organizations))
  end
end
# The real thing
# Downloads person entries from the pmocl PopIt API and stores one flat
# record per person through PeopleStorage#post.
class CongressmenProfiles < PeopleStorage
  def initialize
    super()
    @location = 'http://pmocl.popit.mysociety.org/api/v0.1/persons/?per_page=200'
    @location_organizations = 'http://pmocl.popit.mysociety.org/api/v0.1/organizations/'
  end

  # Fetches the person listing at @location and posts a record for every
  # entry under its 'result' key. Returns that list of entries.
  def process
    response = HTTParty.get(@location, :content_type => :json)
    JSON.parse(response.body)['result'].each do |congressman|
      post get_info(congressman)
    end
  end

  # Builds the flat record Hash for one person entry.
  #
  # congressman - Hash parsed from the API. Reads 'id', 'name', 'title',
  #               'memberships', 'represent' and 'images'. A missing or empty
  #               'memberships', 'represent' or 'images' yields blank or nil
  #               fields instead of raising.
  #
  # Only the first membership is looked at; its organization is resolved with
  # one extra HTTP request (see #get_memberships).
  #
  # Returns a Hash with String keys.
  def get_info(congressman)
    membership = (congressman['memberships'] || []).first
    organization_id = membership && membership['organization_id']
    # An empty String (not an empty Array) stands for "no organization";
    # #post only JSON-encodes Arrays, so it is stored unchanged.
    organizations = organization_id ? get_memberships(organization_id) : ''

    represent = (congressman['represent'] || []).first || {}
    district = represent['district']
    image = (congressman['images'] || []).first

    {
      'uid' => congressman['id'],
      'name' => ascii(congressman['name']),
      'chamber' => congressman['title'],
      # NOTE(review): presumably restores an ordinal suffix that arrives as
      # '?' from the API (e.g. "8?" -> "8ta.") - confirm against live data.
      'district' => district && district.gsub('?', 'ta.'),
      'commune' => ascii(represent['comunas']),
      'region' => ascii(represent['region']),
      'profile_image' => image ? image['url'] : '',
      'organization_id' => organization_id || '',
      'organizations' => organizations,
      'date_scraped' => Date.today.to_s
    }
  end

  # Fetches one organization and lists the names it is known by.
  #
  # organization_id - String id appended to @location_organizations.
  #
  # Returns an Array: the organization's 'name' first, followed by the 'name'
  # of each entry in 'other_names' (none when that key is absent).
  def get_memberships(organization_id)
    response = HTTParty.get(@location_organizations + organization_id, :content_type => :json)
    organization = JSON.parse(response.body)['result']
    other_names = (organization['other_names'] || []).map { |other| other['name'] }
    [organization['name']] + other_names
  end

  private

  # Transliterates +text+ with I18n, passing nil through untouched so a
  # missing field does not abort the whole scrape.
  def ascii(text)
    text && I18n.transliterate(text)
  end
end
# Runner
# Start scraping as soon as the file is loaded, unless Test::Unit::TestCase
# is already defined at that point.
CongressmenProfiles.new.process unless defined?(Test::Unit::TestCase)