-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathspider.rb
99 lines (77 loc) · 2.43 KB
/
spider.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
# Copyright 2011 Marco Dinacci <[email protected]> / www.intransitione.com
#
# Hi, this program reads job listings from the careers.stackoverflow.com website and
# dumps them to a file. It then reads the data back and outputs JSON files ready to be
# used with the Google Visualization API.
#
# You are free to do what you want with it except pretend that you wrote it.
# If you redistribute it, keep the copyright line above.
#
# This module contains the web crawler.
module Spider
require 'rubygems'
require 'nokogiri'
require 'open-uri'
require_relative 'job'
# Default settings for the spider: the listing URL plus the XPath expressions
# used to locate each job and its fields. Everything is exposed as a class
# method, so the class itself is passed around as the configuration object.
class DefaultSpiderConfiguration
  class << self
    # Base URL of the job listing (a page number is appended by the spider).
    # A new String is returned on every call.
    def source
      "http://careers.stackoverflow.com/jobs"
    end

    # XPath selecting one element per job, evaluated against the whole page.
    def jobs
      "//div[@class='list jobs']/div[@data-jobid]"
    end

    # The XPaths below are relative (".//") to a single job element.

    # Job title, read from the link's title attribute.
    def title
      ".//a[@class='job-link']/@title"
    end

    # Text of the "joeltestscore" span.
    def score
      ".//span[@class='joeltestscore']/text()"
    end

    # Text of the location paragraph.
    def location
      ".//p[@class='location']/text()"
    end

    # Text of every tag link attached to the job.
    def tags
      ".//a[@class='post-tag job-link']/text()"
    end

    # Text of the description paragraph.
    def description
      ".//p[@class='description']/text()"
    end
  end
end
# Crawls a paginated job listing and converts every listed job into a Job::Job.
#
# The configuration object must respond to +source+ (base URL of the listing)
# and to +jobs+, +title+, +score+, +location+, +tags+ and +description+
# (XPath expressions), as DefaultSpiderConfiguration does.
class StackOverflowSpider
  # Page numbers requested when #crawl is called without an argument.
  DEFAULT_PAGES = (0..10)

  # Tags that are semantically the same but have different text, mapped to the
  # single spelling stored on the job.
  TAG_ALIASES = {
    "html5" => "html",
    "html4" => "html",
    "rails" => "ruby-on-rails",
    "ror"   => "ruby-on-rails",
    "css3"  => "css"
  }.freeze

  # Runs of whitespace inside the raw location text.
  # NOTE(review): as written the first alternative is redundant with \s; it may
  # originally have been a different character (e.g. a non-breaking space or
  # an HTML entity) - confirm against the upstream file.
  WHITESPACE_RUN = /( |\s)+/

  # config - object providing the listing URL and the XPath expressions.
  def initialize config
    @config = config
  end

  # Fetches every page in +pages+ and returns an Array with one Job::Job per
  # job element found, in page order.
  #
  # pages - enumerable of page numbers, each appended to the source URL as
  #         "?pg=<n>". Defaults to DEFAULT_PAGES (0 through 10).
  #
  # Errors raised while fetching or parsing a page are not rescued here.
  def crawl(pages = DEFAULT_PAGES)
    jobs = []
    pages.each do |pagenum|
      # Build a new string: appending with << would modify the object returned
      # by the configuration, which breaks when that object is shared or frozen.
      doc = fetch_page("#{@config.source}?pg=#{pagenum}")
      doc.xpath(@config.jobs).each { |job_element| jobs.push(build_job(job_element)) }
    end
    jobs
  end

  private

  # Downloads +url+ and returns the parsed HTML document. The block form makes
  # sure the underlying IO is closed once parsing is done.
  def fetch_page(url)
    if URI.respond_to?(:open)
      # Kernel#open stopped handling URLs in Ruby 3.0; URI.open is the
      # supported open-uri entry point.
      URI.open(url) { |io| Nokogiri::HTML(io) }
    else
      # Older Rubies where open-uri only patches Kernel#open.
      open(url) { |io| Nokogiri::HTML(io) }
    end
  end

  # Builds a Job::Job from a single job element of the listing.
  def build_job(job_element)
    job = Job::Job.new
    job.title = job_element.xpath(@config.title).to_s
    job.score = job_element.xpath(@config.score).to_s.to_i
    job_element.xpath(@config.tags).each { |tag| job.tags.push(normalize_tag(tag)) }
    job.locations = split_locations(job_element.xpath(@config.location).to_s)
    job.description = job_element.xpath(@config.description).to_s
    job
  end

  # Lower-cases the tag text and replaces known aliases with their common name.
  def normalize_tag(tag)
    name = tag.to_s.downcase
    TAG_ALIASES.fetch(name, name)
  end

  # Collapses whitespace runs into a single space and splits on ';' to obtain
  # an Array of locations. Uses gsub rather than gsub!, because gsub! returns
  # nil when nothing was replaced and the following split would then fail.
  def split_locations(text)
    text.gsub(WHITESPACE_RUN, " ").split(';')
  end
end
end