-
Notifications
You must be signed in to change notification settings - Fork 0
/
parser.rb
52 lines (43 loc) · 1.17 KB
/
parser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
require 'rubygems'
require 'mechanize'
module Parsers
  # Scraper for the "FoundationDetails" pages served under BASE_URL.
  #
  # Pages are addressed by a sequential numeric id appended to BASE_URL.
  # Every page whose table yields a non-empty "stiftelsenamn" entry is
  # stored as a Hash (label => value) in #data.
  class Lansstyrelsen
    # Detail-page URL prefix; the numeric page id is appended to it.
    BASE_URL = 'http://web05.lansstyrelsen.se/stift/StiftWeb/FoundationDetails.aspx?id='.freeze

    # Array of Hashes, one per successfully scraped page.
    attr_accessor :data

    # @param limit [Integer, nil] highest page id to request; nil means
    #   "practically unlimited" (1_000_000_000).
    def initialize(limit = nil)
      @limit = limit || 1_000_000_000
      @data = []
      @agent = Mechanize.new do |agent|
        agent.user_agent_alias = 'Mac Safari'
      end
    end

    # Fetches pages 1..limit in order and appends every page that carries a
    # "stiftelsenamn" value to #data.
    #
    # A page without that value (including a page with no matching table rows
    # at all) counts as a failure; scraping stops as soon as the number of
    # failures exceeds +limit_fails+.
    #
    # @param limit_fails [Integer] number of failed pages tolerated.
    # @return [Array<Hash>] the accumulated #data.
    def get_all_pages(limit_fails = 1_000)
      count_fails = 0
      # upto fetches exactly @limit pages at most (and none when @limit < 1).
      1.upto(@limit) do |page_num|
        @page = @agent.get("#{BASE_URL}#{page_num}")
        current_data = scrape_data
        # to_s guards against a missing key (nil) on pages without the table.
        if current_data["stiftelsenamn"].to_s.empty?
          count_fails += 1
          break if count_fails > limit_fails
        else
          @data << current_data
        end
      end
      @data
    end

    # Scrape data from table
    #
    # Reads every <tr> of the current page whose class contains "Item" and
    # maps the text of its first cell (downcased, with backslashes and colons
    # removed) to the text of its last cell.
    #
    # @return [Hash{String => String}] empty when no rows match.
    def scrape_data
      rows = @page.search("//tr[contains(@class,'Item')]")
      rows.each_with_object({}) do |row, data|
        cells = row.search("td")
        label_cell = cells.first
        # A row without any <td> carries no label/value pair; skip it.
        next unless label_cell

        data[label_cell.text.downcase.gsub(/\\|:/, '')] = cells.last.text
      end
    end
  end
end
# parser = Parsers::Lansstyrelsen.new
# parser.get_all_pages
# pp parser.data