-
Notifications
You must be signed in to change notification settings - Fork 9
/
ghUpdatedMirror.py
123 lines (109 loc) · 3.39 KB
/
ghUpdatedMirror.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
'''
Script to scrape GitHub repos using the GraphQL API
Obtains all repos that have been updated AFTER a specified date
Scrapes all repos from that date up to the current time
'''
import json
import sys
import time
from datetime import datetime, timedelta, timezone

import pymongo
import requests
# get start and end date, and GITHUB API token from command line
# The token is read from stdin (piped in) rather than hard-coded.
token = sys.stdin.readline().strip()
# DB info
# Connects to the default MongoDB instance (localhost:27017).
client = pymongo.MongoClient()
dbName = sys.argv[1] # db name as second arg
collName = sys.argv[2] # coll name as third arg
db = client[dbName]
coll = db[collName]
# GitHub GraphQL endpoint; auth is via the "token <PAT>" header scheme.
url = 'https://api.github.com/graphql'
headers = {'Authorization': 'token ' + token}
# Running count of repos inserted (updated by gatherData).
total = 0
# Remaining GraphQL rate-limit points; 5000 is the documented hourly cap,
# overwritten with the real value from each response.
remaining = 5000
# query that specifies which repos and what content to extract
# Searches all public, non-archived mirror repos, 100 per page (the API
# maximum); rateLimit fields drive the throttling logic below, and
# pageInfo.endCursor is spliced back in as `after:` for pagination.
query = '''{
rateLimit {
cost
remaining
resetAt
}
search(query: "is:public archived:false mirror:true", type: REPOSITORY, first: 100) {
repositoryCount
pageInfo {
hasNextPage
endCursor
startCursor
}
nodes {
... on Repository {
nameWithOwner
updatedAt
createdAt
pushedAt
id
forkCount
isMirror
parent { nameWithOwner }
description
}
}
}
}'''
# Request payload sent to the GraphQL endpoint; 'query' is mutated
# during pagination.
jsonS = { 'query': query }
# wait for reset if we exhaust our number of calls
# wait for reset if we exhaust our number of calls
def wait(reset):
    """Sleep until the GitHub rate limit resets, plus a 30s margin.

    reset -- the rateLimit.resetAt value from a GraphQL response, an
             ISO-8601 UTC timestamp such as "2024-01-01T12:00:00Z".
    """
    # resetAt is UTC (trailing "Z"), so compare against UTC "now";
    # the old naive datetime.now() was off by the local UTC offset.
    then = datetime.strptime(reset, "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    # Clamp at zero: time.sleep raises ValueError on negative values,
    # which could happen if resetAt is already in the past.
    time.sleep(max(0.0, (then - now).total_seconds() + 30))
# helper function to loop through and insert repos into mongo db
# helper function to loop through and insert repos into mongo db
def gatherData(res):
    """Insert every repo node from one GraphQL response into Mongo.

    res -- decoded JSON response; repo dicts live under data.search.nodes.
    Updates the module-level running total and prints a progress line
    using the module-level `remaining` rate-limit counter.
    """
    global total
    repos = res['data']['search']['nodes']
    if repos:
        # One bulk round-trip; Collection.insert was deprecated and then
        # removed in pymongo 4. insert_many rejects an empty list, hence
        # the guard.
        coll.insert_many(repos)
    total += len(repos)
    output = "Got {} repos. Total count is {}. Have {} calls remaining."
    print(output.format(len(repos), total, remaining))
# driver loop that iterates through repos in 10 minute intervals
# iterates from the specified date up to the current time
if token == '':
    print("Please provide your Github API token in the script. Exiting.")
    sys.exit()

def _process(res):
    """Handle one GraphQL response: update the rate-limit counter,
    sleep through a reset if needed, store the repos, and return
    (repositoryCount, hasNextPage, endCursor) for the pagination loop."""
    global remaining
    remaining = res['data']['rateLimit']['remaining']
    # Each search costs ~1 point; pause before we run completely dry.
    if remaining < 11:
        wait(res['data']['rateLimit']['resetAt'])
    search = res['data']['search']
    gatherData(res)
    return (search['repositoryCount'],
            search['pageInfo']['hasNextPage'],
            search['pageInfo']['endCursor'])

jsonS['query'] = query
r = requests.post(url=url, json=jsonS, headers=headers)
if r.ok:
    try:
        repoCount, hasNextPage, endCursor = _process(r.json())
        # The API returns at most 100 nodes per page; keep paging while
        # the server says there is more.
        while repoCount > 100 and hasNextPage:
            print("Have to paginate, using cursor {}".format(endCursor))
            # Splice the cursor into the pristine base query right after
            # the `type: REPOSITORY` argument.
            idx = query.find("REPOSITORY") + len("REPOSITORY")
            jsonS['query'] = query[:idx] + ',after:"{}"'.format(endCursor) + query[idx:]
            r = requests.post(url=url, json=jsonS, headers=headers)
            if not r.ok:
                # Previously a failed page left the loop state unchanged
                # and re-posted the same cursor forever; stop instead.
                print("Request failed with status {}; stopping.".format(r.status_code))
                break
            repoCount, hasNextPage, endCursor = _process(r.json())
    except Exception as e:
        print(e)