#!/usr/bin/python3
# sfRepos.py -- forked from ssc-oscar/gather.
# Collects SourceForge project URLs from the site's sitemap XML files and
# stores them in a MongoDB collection.
# Usage: sfRepos.py <dbname> <collection>
from urllib import request
import xml.etree.ElementTree as ET
import re, itertools, os, pymongo, sys
# Command-line arguments: the database name comes first, then the collection
# name. Fail with a usage message instead of an IndexError traceback when
# either one is missing.
if len(sys.argv) < 3:
    sys.exit('usage: sfRepos.py <dbname> <collection>')
dbname = sys.argv[1]  # expects db name as first argument
collName = sys.argv[2]  # expects collection name as second argument

# DB info: MongoClient() is called without arguments, so pymongo's default
# connection settings are used.
client = pymongo.MongoClient()
db = client[dbname]
coll = db[collName]

# Namespace prefix that ElementTree puts in front of every tag of a sitemap
# document; needed to match the <loc> elements by their qualified name.
uri = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
# The base url for every project that we append the project name to
url = 'https://sourceforge.net/projects/'
# Regex used to extract the project name from the XML values: the shortest
# run of characters between 'projects/' and the next '/'.
p = re.compile(r'projects/(.+?)/')
# A set to keep track of all UNIQUE project urls
projects = set()
def parseXML(fname):
    """
    Parse the sitemap XML file named fname and record its project urls.

    Visits every child of the document root and looks at that child's
    direct <loc> elements (in the sitemap namespace held by ``uri``).
    When the text of a <loc> matches the module-level regex ``p``, the
    captured project name is appended to ``url`` and the result is added
    to the module-level ``projects`` set, which discards duplicates.

    Args:
        fname: path of the XML file to parse.

    Returns:
        None. The function works by updating ``projects``.

    Raises:
        Any error raised by ``ET.parse`` (for example on a missing or
        malformed file) is passed on to the caller.
    """
    root = ET.parse(fname).getroot()
    loc_tag = uri + 'loc'
    for entry in root:
        for loc in entry.findall(loc_tag):
            # An empty <loc/> element has text None, and re.search() raises
            # TypeError on None, so skip such elements.
            if not loc.text:
                continue
            match = p.search(loc.text)
            if match is not None:
                projects.add(url + match.group(1))
def get(page):
    """
    Download the sitemap XML file for one page and parse it.

    The file https://sourceforge.net/sitemap-<page>.xml is saved in the
    current directory as sitemap-<page>.xml, handed to parseXML(), and
    removed again. The scratch file is removed even when the download or
    the parsing fails, so no partial files are left behind.

    Args:
        page: value substituted into the sitemap file name (the driver
            passes integers).

    Returns:
        None. Failures are reported on stdout and otherwise ignored so
        that one bad page does not stop the whole run.
    """
    base = 'https://sourceforge.net/sitemap-{}.xml'.format(page)
    dest = 'sitemap-{}.xml'.format(page)
    try:
        print('Downloading ' + base)
        request.urlretrieve(base, dest)
        parseXML(dest)
    except Exception as e:
        # Deliberately broad: this is a best-effort crawl, so any failure
        # for a single page is logged and the caller moves on.
        print('Download ERROR: ', str(e))
    finally:
        # Clean up in every case; the file may not exist if the download
        # failed before anything was written.
        if os.path.exists(dest):
            try:
                os.remove(dest)
            except OSError as e:
                print('Download ERROR: ', str(e))
# Driver loop: fetch sitemap pages 0 through 172 (173 files in total).
# The number of pages was taken from https://sourceforge.net/sitemap.xml
for i in range(0, 173):
    get(i)
# Insert all projects into the collection in one bulk call.
# Collection.insert() was deprecated in pymongo 3.0 and removed in 4.0;
# insert_many() is its replacement. insert_many() rejects an empty list,
# so skip the call when nothing was collected.
if projects:
    coll.insert_many(
        [{"url": proj, "source": "SourceForge", "git": None}
         for proj in projects]
    )
# Print how many projects we found
print("# projects: " + str(len(projects)))