probgate_contests_scraper.py
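"""Scraper for probgate.org contests.

Logs in with credentials from a .env file, collects contest metadata into
data_private/probgate/contests.json, and downloads each problem's export
ZIP into data_private/probgate/problems/<problem_id>/.
"""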
import io
import json
import logging
import os
import re
import shutil
import time
import zipfile

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

logging.basicConfig(level=logging.WARNING)

# Constants
REQUEST_DELAY = 0.12  # seconds between requests

# Contest cutoff - don't scrape contests dated after this (January 2025)
CUTOFF_MONTH = 1
CUTOFF_YEAR = 25  # two-digit year


def parse_contest_date(contest_name):
    """Parse month and year from a contest name (e.g., 'DEC24 Bronze')"""
    month_map = {
        'DEC': 12,
        'JAN': 1,
        'FEB': 2,
        'OPEN': 3,  # OPEN happens in March
    }
    match = re.match(r'([A-Z]+)(\d+)', contest_name)
    if not match:
        return None, None
    month_str, year_str = match.groups()
    month = month_map.get(month_str)
    if month is None:
        return None, None
    try:
        year = int(year_str)
        return month, year
    except ValueError:
        return None, None


def is_contest_before_cutoff(contest_name):
    """Check if the contest is before or at the cutoff date"""
    month, year = parse_contest_date(contest_name)
    if month is None or year is None:
        return False
    # Convert the 2-digit year to a comparable 4-digit year
    if year > 90:  # Assume 90-99 means 1990-1999
        year += 1900
    else:  # Assume 00-89 means 2000-2089
        year += 2000
    cutoff_year = 2000 + CUTOFF_YEAR
    # Compare years first
    if year < cutoff_year:
        return True
    if year > cutoff_year:
        return False
    # If same year, compare months
    return month <= CUTOFF_MONTH
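# Example: is_contest_before_cutoff('DEC24 Bronze') -> True,
# since December 2024 falls before the January 2025 cutoff.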


def parse_contest_info(contest_name):
    """Parse a contest name into month, year, and division"""
    # Skip contests that don't match our expected format
    match = re.match(r'([A-Z]+)(\d+)\s+([A-Za-z]+)', contest_name)
    if not match:
        logging.warning(f"Contest name '{contest_name}' does not match expected format")
        return None
    month, year, division = match.groups()
    return {
        'month': month,
        'year': year,
        'division': division,
    }
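# Example: parse_contest_info('DEC24 Bronze')
#   -> {'month': 'DEC', 'year': '24', 'division': 'Bronze'}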


def get_linked_problem_id(session, problem_id):
    """Check if a problem is a link and return the linked problem ID if it is"""
    edit_url = f'https://probgate.org/probgate/edit.php?pid={problem_id}'
    try:
        response = session.get(edit_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        statement_text = soup.find('textarea', {'id': 'statement_text'})
        if statement_text:
            content = statement_text.text.strip()
            # Look for the link pattern [a|https://probgate.org/viewproblem.php?pid=XXXX]Link[/a]
            link_match = re.search(r'\[a\|https://probgate\.org/viewproblem\.php\?pid=(\d+)\]Link\[/a\]', content)
            if link_match:
                return link_match.group(1)
        return None
    except requests.RequestException as e:
        logging.error(f"Error checking if problem {problem_id} is a link: {e}")
        return None


def get_contest_problems(session, contest_id):
    """Fetch and parse the problems for a specific contest"""
    config_url = f'https://probgate.org/contest/config.php?cid={contest_id}'
    try:
        response = session.get(config_url)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        problems_div = soup.find('div', id='problems')
        if not problems_div:
            logging.warning(f"No problems div found for contest {contest_id}")
            return []
        problems = []
        for row in problems_div.find_all('tr')[1:]:  # Skip header row
            cols = row.find_all('td')
            if len(cols) >= 2:
                problem_id = cols[0].text.strip()
                problem_link = cols[1].find('a')
                if problem_link:
                    problem_name = problem_link.text.strip()
                    if problem_name.endswith(' (Link)'):
                        problem_name = problem_name[:-7]  # Remove ' (Link)' suffix
                    # Check if this is a linked problem
                    linked_id = get_linked_problem_id(session, problem_id)
                    if linked_id:
                        print(f"Problem {problem_name} (ID: {problem_id}) is a link to problem {linked_id}")
                        problem_id = linked_id
                    problems.append({
                        'problem_id': problem_id,
                        'name': problem_name,
                    })
                    # Small delay between per-problem requests to avoid overwhelming the server
                    time.sleep(REQUEST_DELAY)
        return problems
    except requests.RequestException as e:
        logging.error(f"Error fetching problems for contest {contest_id}: {e}")
        return []


def get_problem_zip(session, problem_id):
    """Download and extract a problem's export ZIP file"""
    export_url = f'https://probgate.org/probgate/export.php?pid={problem_id}'
    # Form data for the export request
    data = {
        'statement': 'on',
        'analysis': 'on',
        'render': 'on',
        'attachments': 'on',
        'tests': 'on',
        'grader': 'on',
        'scorer': 'on',
        'validator': 'on',
        'solutions': 'on',
        'generators': 'on',
        'submissions': 'on',
        'archive': 'zip',
        'export': 'Export',
    }
    # Create the output directory if needed, and compute the temporary and
    # final extraction paths up front so the error handlers below can
    # always reference them
    os.makedirs('data_private/probgate/problems', exist_ok=True)
    tmp_dir = os.path.join('data_private/probgate/problems', f"{problem_id}.tmp")
    final_dir = os.path.join('data_private/probgate/problems', str(problem_id))
    try:
        # Make the export request
        headers = {
            'Referer': f'https://probgate.org/probgate/export.php?pid={problem_id}'
        }
        response = session.post(export_url, data=data, headers=headers)
        response.raise_for_status()
        # Remove tmp_dir if it exists (from a previous failed attempt)
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
        os.makedirs(tmp_dir)
        # Extract the ZIP file to the temporary directory
        with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
            zip_ref.extractall(tmp_dir)
        # Move the temporary directory into place
        if os.path.exists(final_dir):
            shutil.rmtree(final_dir)
        os.rename(tmp_dir, final_dir)
        print(f"Successfully downloaded and extracted problem {problem_id}")
        return True
    except requests.RequestException as e:
        logging.error(f"Error downloading problem {problem_id}: {e}")
        return False
    except zipfile.BadZipFile as e:
        logging.error(f"Error extracting ZIP for problem {problem_id}: {e}")
        return False
    except Exception as e:
        logging.error(f"Error processing problem {problem_id}: {e}")
        return False
    finally:
        # Clean up the temporary directory if it was left behind
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)


def scrape_problems(session, contests):
    """Download the problem ZIP files for every contest"""
    for contest in contests.values():
        if 'problems' not in contest:
            continue
        for problem in contest['problems']:
            problem_id = problem['problem_id']
            problem_dir = os.path.join('data_private/probgate/problems', str(problem_id))
            # Skip if we already have this problem
            if os.path.exists(problem_dir):
                print(f"Skipping problem {problem['name']} (ID: {problem_id}) - already downloaded")
                continue
            print(f"Downloading problem {problem['name']} (ID: {problem_id})...")
            get_problem_zip(session, problem_id)
            # Add a small delay between downloads
            time.sleep(REQUEST_DELAY)
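

# contests.json stores a list of contest records; each record looks like
# (illustrative shape, IDs elided):
#   {"contest_id": "...", "name": "DEC24 Bronze", "month": "DEC",
#    "year": "24", "division": "Bronze", "problems": [{"problem_id": "...", "name": "..."}]}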
def load_existing_contests():
    """Load existing contests from the JSON file if it exists"""
    try:
        with open('data_private/probgate/contests.json', 'r', encoding='utf-8') as f:
            return {contest['contest_id']: contest for contest in json.load(f)}
    except (FileNotFoundError, json.JSONDecodeError):
        return {}


def save_contests(contests):
    """Save contests to the JSON file"""
    os.makedirs('data_private/probgate', exist_ok=True)
    with open('data_private/probgate/contests.json', 'w', encoding='utf-8') as f:
        json.dump(list(contests.values()), f, indent=2)
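

# The .env file is expected to provide the credentials (placeholder values):
#   PROBGATE_USERNAME=your_username
#   PROBGATE_PASSWORD=your_password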
def login_to_probgate():
    """Log in to Probgate and return an authenticated session"""
    session = requests.Session()
    # Get login credentials from environment variables
    username = os.getenv('PROBGATE_USERNAME')
    password = os.getenv('PROBGATE_PASSWORD')
    if not username or not password:
        raise ValueError("PROBGATE_USERNAME and PROBGATE_PASSWORD must be set in the .env file")
    # Log in to Probgate
    login_url = 'https://probgate.org/login.php'
    login_data = {
        'user': username,
        'password': password,
    }
    try:
        # First, fetch the login page so the session picks up any cookies
        # (no CSRF token is extracted here)
        login_page = session.get(login_url)
        login_page.raise_for_status()
        # Perform the login with a Referer header
        headers = {
            'Referer': 'https://probgate.org/login.php'
        }
        response = session.post(login_url, data=login_data, headers=headers)
        response.raise_for_status()
        # Check whether the login succeeded by looking for common failure indicators
        if 'incorrect' in response.text.lower() or 'failed' in response.text.lower():
            logging.error("Login failed. Please check your credentials.")
            return None
        return session
    except requests.RequestException as e:
        logging.error(f"Error logging in to Probgate: {e}")
        return None


def scrape_probgate():
    """Scrape the contest list and each contest's problems"""
    # Load existing contests
    existing_contests = load_existing_contests()
    # Create a session to maintain cookies
    session = login_to_probgate()
    if not session:
        return None, None
    target_url = 'https://probgate.org/contest/contestgate.php'
    try:
        # Get the contest listing page
        contest_page = session.get(target_url)
        contest_page.raise_for_status()
        # Parse the page content
        soup = BeautifulSoup(contest_page.text, 'html.parser')
        # Find all contest tables - they have class 'subtable sortable'
        tables = soup.find_all('table', {'class': 'subtable sortable'})
        if not tables:
            print("No contest tables found")
            return None, None
        # Seed with the existing contests so a mid-run save never drops
        # contests scraped on a previous run
        contests = dict(existing_contests)
        for table in tables:
            # Skip header row
            for row in table.find_all('tr')[1:]:
                cols = row.find_all('td')
                if len(cols) >= 2:
                    contest_id = cols[0].text.strip()
                    contest_link = cols[1].find('a')
                    if contest_link:
                        contest_name = contest_link.text.strip()
                        # Skip contests after the cutoff date
                        if not is_contest_before_cutoff(contest_name):
                            print(f"Skipping {contest_name} (ID: {contest_id}) - after cutoff date or invalid date")
                            continue
                        # Skip if we already have this contest and its problems
                        if contest_id in existing_contests and 'problems' in existing_contests[contest_id]:
                            print(f"Skipping {contest_name} (ID: {contest_id}) - already scraped")
                            continue
                        info = parse_contest_info(contest_name)
                        if info:
                            print(f"Scraping {contest_name} (ID: {contest_id})...")
                            problems = get_contest_problems(session, contest_id)
                            contests[contest_id] = {
                                'contest_id': contest_id,
                                'name': contest_name,
                                'month': info['month'],
                                'year': info['year'],
                                'division': info['division'],
                                'problems': problems,
                            }
                            # Save progress after each contest
                            save_contests(contests)
                            # Add a small delay between requests
                            time.sleep(REQUEST_DELAY)
        print(f"\nSuccessfully saved {len(contests)} contests to 'data_private/probgate/contests.json'")
        return session, contests
    except requests.RequestException as e:
        logging.error(f"Error fetching contests: {e}")
        return None, None


def main():
    session, contests = scrape_probgate()
    if session and contests:
        # Download the problem ZIPs
        print("\nDownloading problem files...")
        scrape_problems(session, contests)


if __name__ == "__main__":
    main()