-
Notifications
You must be signed in to change notification settings - Fork 9
/
ghFork.py
126 lines (110 loc) · 3.52 KB
/
ghFork.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
'''
Script to scrape GitHub repos using the GraphQL API
Obtains all repos that have been updated AFTER a specified date
Scrapes all repos from that date up to the current time
'''
import requests
import json
import pymongo
from datetime import datetime, timedelta
import time
import sys
# DB info and CLI arguments: usage is
#   ghFork.py <dbName> <collName> <token>   (usernames piped on stdin)
client = pymongo.MongoClient()  # default connection: localhost MongoDB
dbName = sys.argv[1] # database name (first CLI argument)
collName = sys.argv[2] # collection name (second CLI argument)
token = sys.argv[3] # GitHub API token (third CLI argument)
db = client[dbName]
coll = db[collName]
url = 'https://api.github.com/graphql'
headers = {'Authorization': 'token ' + token}
total = 0 # running count of repos inserted across all users
remaining = 5000 # GraphQL rate-limit budget; refreshed from each API response
# GraphQL query template: for one user login (%s, filled in per stdin line),
# fetch up to 100 repositories with fork/parent info, plus the current
# rate-limit status so the driver loop knows when to back off.
query = '''{
rateLimit {
cost
remaining
resetAt
}
user (login:"%s") {
login
repositories (first:100) {
totalCount
pageInfo {
hasNextPage
endCursor
startCursor
}
nodes {
nameWithOwner
parent { nameWithOwner }
isFork
}
}
}
}'''
# request payload; 'query' is overwritten for each user in the driver loop
jsonS = { 'query': query }
# Sleep until the GraphQL rate limit resets.
# reset: the rateLimit.resetAt value, an ISO-8601 UTC timestamp string
# such as "2024-01-01T12:00:00Z".
def wait(reset):
    # resetAt is UTC, so compare against UTC now. (The original used the
    # local-time datetime.now(), skewing the sleep by the UTC offset.)
    now = datetime.utcnow()
    then = datetime.strptime(reset, "%Y-%m-%dT%H:%M:%SZ")
    # 30s safety margin; clamp at 0 so an already-past reset time never
    # hands time.sleep() a negative argument (which raises ValueError).
    delay = max((then - now).total_seconds() + 30, 0)
    time.sleep(delay)
# Insert every repository node from one GraphQL response into MongoDB and
# print a progress line.
# res: the decoded JSON response dict (expects data.user.repositories.nodes).
# Side effects: writes to the module-level `coll` collection, bumps the
# module-level `total` counter, and reads the module-level `remaining`
# rate-limit counter for the progress message.
def gatherData(res):
    global total
    repos = res['data']['user']['repositories']['nodes']
    for repo in repos:
        # Collection.insert() was removed in PyMongo 4;
        # insert_one() is the supported single-document call.
        coll.insert_one(repo)
    total += len(repos)
    output = "Got {} repos. Total count is {}. Have {} calls remaining."
    print(output.format(len(repos), total, remaining))
# Driver loop: read GitHub usernames from stdin (one per line) and scrape
# each user's repositories, paginating when a user has more than 100.
# The token is constant, so check it once up front instead of per user.
if token == '':
    print("Please provide your Github API token in the script. Exiting.")
    sys.exit()

for line in sys.stdin:
    owner = line.rstrip()
    nextQuery = query % (owner)
    jsonS['query'] = nextQuery
    r = requests.post(url=url, json=jsonS, headers=headers)
    if not r.ok:
        continue
    try:
        res = json.loads(r.text)
        remaining = res['data']['rateLimit']['remaining']
        reset = res['data']['rateLimit']['resetAt']
        if remaining < 11:
            wait(reset)
        repoCount = res['data']['user']['repositories']['totalCount']
        hasNextPage = res['data']['user']['repositories']['pageInfo']['hasNextPage']
        gatherData(res)
        # Follow the cursor while more than one page of results exists.
        while repoCount > 100 and hasNextPage:
            endCursor = res['data']['user']['repositories']['pageInfo']['endCursor']
            print("Have to paginate, using cursor {}".format(endCursor))
            # BUG FIX: the original spliced the cursor at
            # nextQuery.find("REPOSITORY"), but "REPOSITORY" never occurs in
            # this query, so find() returned -1 and the cursor landed at a
            # bogus offset, producing invalid GraphQL. Anchor on the
            # "first:100" argument, which is present exactly once.
            pageQuery = nextQuery.replace(
                'first:100', 'first:100, after:"{}"'.format(endCursor))
            jsonS['query'] = pageQuery
            r = requests.post(url=url, json=jsonS, headers=headers)
            if not r.ok:
                break
            res = json.loads(r.text)
            remaining = res['data']['rateLimit']['remaining']
            reset = res['data']['rateLimit']['resetAt']
            if remaining < 11:
                wait(reset)
            hasNextPage = res['data']['user']['repositories']['pageInfo']['hasNextPage']
            gatherData(res)
    except Exception as e:
        # Best-effort per-user scraping: log the failure and move on to the
        # next username rather than aborting the whole run.
        print(e)