forked from yegor256/scrape-maven-central
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.rb
executable file
·126 lines (117 loc) · 4.58 KB
/
scrape.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env ruby
# -*- coding:utf-8 -*-
# Do not remove the encoding comment on the line above.
# Copyright (c) 2017-2020 Yegor Bugayenko
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Write progress output immediately instead of buffering it.
STDOUT.sync = true
require 'net/http'
require 'date'
require 'nokogiri'
require 'slop'
# Handle of the CSV output file; assigned once the command line has been
# parsed and written to by scrape.
$result_file = nil
# Downloads one path below https://repo1.maven.org/maven2/ and returns its body.
#
# The path is echoed to stdout as a progress trace. A request that raises
# (for example because the connection dropped) is retried until it succeeds;
# the connection to the repository is known to be unstable, so giving up is
# deliberately not an option here. Each failure is reported on stderr and
# followed by a pause that doubles from 1 second up to 30 seconds, so that a
# longer outage does not turn into a busy loop.
#
# @param path [String] path relative to the repository root
# @return [String] the response body for a 200 response, otherwise ''
def get(path)
  puts path
  uri = URI.parse("https://repo1.maven.org/maven2/#{path}")
  req = Net::HTTP::Get.new(uri.to_s)
  delay = 1
  res = nil
  until res
    begin
      res = Net::HTTP.start(uri.host, uri.port, use_ssl: true) do |http|
        http.request(req)
      end
    rescue StandardError => e
      warn "#{path}: #{e.class}: #{e.message}; retrying in #{delay}s"
      sleep(delay)
      delay = [delay * 2, 30].min
    end
  end
  # The repository listing sometimes links to directories that answer 404
  # (example: cerberus/ under https://repo1.maven.org/maven2/co/privacyone/).
  # Such pages are treated as empty.
  res.code == '200' ? res.body : ''
end
# Crawls one repository directory and, recursively, the directories below it.
#
# A directory whose listing mentions maven-metadata.xml is treated as an
# artifact: one CSV row is written to $result_file and the crawl does not
# descend further. Every other directory is expanded by following its links.
#
# NOTE(review): the link pattern only accepts names made of ASCII letters and
# hyphens, so directories whose names contain digits, dots or underscores are
# not followed. It also keeps the "../" parent link out of the crawl, so it
# must not simply be widened.
#
# @param path [String] directory path relative to the repository root
# @param ignore [Array<String>] path prefixes that are never visited
# @param start [String] path to begin at; links listed before it are skipped.
#   Directories that are a prefix of +start+ are entered so that a nested
#   start path can be reached.
def scrape(path, ignore = [], start = '')
  body = get(path)
  return if body.include?('maven-metadata.xml') && record_artifact(path, body)

  # Not an artifact directory (or its metadata names no artifact): follow links.
  found = false
  body.scan(%r{href="([a-zA-Z\-]+/)"}).flatten.each do |name|
    target = "#{path}#{name}"
    leads_to_start = start.start_with?(target)
    found = true if leads_to_start || target.start_with?(start)
    next unless found
    next if ignore.any? { |prefix| target.start_with?(prefix) }

    # Keep the start marker only while still walking down towards it.
    scrape(target, ignore, leads_to_start ? start : '')
  end
end

# Writes the CSV row for the artifact whose maven-metadata.xml lives in +path+.
#
# @param path [String] artifact directory path
# @param body [String] the directory listing of +path+, used for the date
# @return [Boolean] true when a row was written, false when the metadata has
#   no groupId or artifactId
def record_artifact(path, body)
  meta = Nokogiri::XML(get("#{path}maven-metadata.xml"))
  group_id = meta.xpath('//groupId/text()')
  artifact_id = meta.xpath('//artifactId/text()')
  # Some directories without any jar still carry a maven-metadata.xml
  # (example: https://repo1.maven.org/maven2/org/apache/).
  return false if group_id.empty? || artifact_id.empty?

  # The date is taken from the listing; leave it blank when the listing shows
  # none instead of aborting the whole crawl.
  match = body.match(%r{maven-metadata.xml</a>\s+(\d{4}-\d{2}-\d{2} )})
  date = match ? Date.strptime(match[1], '%Y-%m-%d') : ''
  latest_version = meta.xpath('//versions/version[last()]/text()')
  $result_file.puts("\"#{path}\",\"#{latest_version}\",\"#{date}\",\"#{group_id}:#{artifact_id}:#{latest_version}\"")
  # To list every version of an artifact instead, use:
  # meta.xpath('//versions/version').each do |version|
  #   $result_file.puts("\"#{path}\",\"#{latest_version}\",\"#{date}\",\"#{group_id}:#{artifact_id}:#{version.content}\"")
  # end
  true
end
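# Command-line options:
# -h  show the help text
# -r  root directory to crawl from
# -i  directory prefixes to ignore
# -s  directory to start crawling from (a previous maintainer reported this
#     option as unreliable; keep the default unless you have verified it)
# -o  name of the CSV file to write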
# Parse the command line; an invalid option is re-raised with a hint that
# points the user to --help.
begin
  opts = Slop.parse(ARGV, strict: true, help: true) do |o|
    o.banner = "Usage: ruby scrape.rb [options]"
    o.bool '-h', '--help', 'Show these instructions'
    o.string '-r', '--root', 'Root path to start from', default: ''
    o.array '-i', '--ignore', 'Prefixes to ignore, like "org/", for example'
    o.string '-s', '--start', 'Start from this path', default: ''
    o.string '-o', '--output', 'Specify output to a .csv file', default: 'result.csv'
  end
rescue Slop::Error => ex
  raise StandardError, "#{ex.message}, try --help"
end
if opts.help?
  puts opts
  exit
end
# The block form closes the output file on every exit path, including an
# unexpected error in the middle of the crawl.
File.open(opts[:output], 'w+') do |file|
  $result_file = file
  file.puts("path,latestVersion,date,artifactAddress")
  begin
    scrape(opts[:root], opts[:ignore], opts[:start])
  rescue Interrupt
    # Ctrl-C stops the crawl without a stack trace; the rows written so far
    # stay in the file.
  end
end