forked from theriley106/IvyBound
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
240 lines (219 loc) · 7.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# encoding=utf8
import sys
# Python 2 hack: force the default string encoding to UTF-8 so str/unicode
# mixing while scraping forum pages does not raise UnicodeDecodeError.
# reload() is needed because site.py deletes sys.setdefaultencoding at
# startup.  NOTE(review): known anti-pattern; ties the script to Python 2.
reload(sys)
sys.setdefaultencoding('utf8')
import requests
import bs4
import threading
import json
# Per-thread-URL summary dicts appended by extract_from_thread_url().
ALL = []
# A discussion title must contain one of these tokens (case-insensitive)
# to be considered a decision thread worth scraping.
KEYWORDS = ["fall", "spring", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019"]
URL = "https://talk.collegeconfidential.com/columbia-school-general-studies/2126809-columbia-gs-fall-2019-early-regular-decision-thread-p22.html"
# Persistent results store, keyed by forum slug -> accepted/rejected/unknown lists.
DB = json.load(open("all.json"))
# Thread URLs passed to search_all(); the most recent entry keys SEARCH_COUNT.
THREADS = []
# Maps a searched thread URL to the number of HTTP requests issued for it.
SEARCH_COUNT = {}
class _FailedResponse(object):
    """Stand-in for requests.Response when every fetch attempt fails.

    Carries only the ``.text`` attribute, which is the one thing callers
    read off grabSite()'s return value.
    """
    text = "<html></html>"


def grabSite(url):
    """Fetch *url*, retrying up to 3 times.

    Returns the requests.Response on success.  On total failure returns a
    _FailedResponse so callers can always read ``.text`` (the original
    returned a bare str here, which crashed every caller on ``res.text``).
    """
    for i in range(3):
        # Track request volume for the active search.  Guarded: when the
        # script is run via __main__, Search() is called directly and
        # THREADS is empty, which used to raise IndexError here.
        if THREADS and THREADS[-1] in SEARCH_COUNT:
            SEARCH_COUNT[THREADS[-1]] += 1
        print(url)
        try:
            # Spoof a desktop browser UA; the forum blocks obvious bots.
            headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
            return requests.get(url, timeout=7, headers=headers)
        except Exception as exp:
            # Best-effort: log and retry.
            print(exp)
    return _FailedResponse()
def is_stats(string):
    """Heuristic: does *string* look like a stats post?

    A stats post mentions 'gpa' (case-insensitive) and has several
    key:value or dash-separated fields (more than two ':' or '-').
    """
    lowered = string.lower()
    mentions_gpa = 'gpa' in lowered
    has_field_separators = string.count(":") > 2 or string.count("-") > 2
    return mentions_gpa and has_field_separators
def get_page_count(url):
    """Return the page count of the thread at *url*.

    Reads the pagination widget's .LastPage element; falls back to 2 when
    the element is missing (short threads) or its text is not an integer.
    The original used a bare ``except:``, which also swallowed
    KeyboardInterrupt/SystemExit.
    """
    res = grabSite(url)
    page = bs4.BeautifulSoup(res.text, 'lxml')
    try:
        return int(page.select(".LastPage")[0].getText())
    except (IndexError, ValueError):
        # No .LastPage element, or non-numeric text inside it.
        return 2
def get_specific_comment(url):
    """Extract one comment from a forum page.

    The comment ID is the part of *url* after the first underscore; the
    matching element is the first .Role_RegisteredUser node whose HTML
    contains the comment's facebook-share link.  Returns the bs4 element,
    or None when no node matches.
    """
    comment_id = url.partition("_")[2]
    response = grabSite(url)
    soup = bs4.BeautifulSoup(response.text, 'lxml')
    needle = "/post/facebook/comment?id=" + comment_id
    for node in soup.select(".Role_RegisteredUser"):
        if needle in str(node):
            return node
    return None
def dig_further(stringVal):
    """Cheap pre-filter for profile comments.

    True when the snippet might contain a decision announcement, making
    the extra API call to fetch the full comment worthwhile.
    """
    lowered = stringVal.lower()
    return any(token in lowered for token in ('accepted', 'rejected', 'decision', '!'))
def extract_url_from_item(itemVal):
    """Return the first href value following 'a href="' in the item's
    HTML, or '' when no anchor marker is present.  An unterminated href
    (no closing quote) yields the remainder of the string, matching the
    original partition-based behavior.
    """
    html = str(itemVal)
    marker = 'a href="'
    start = html.find(marker)
    if start == -1:
        return ''
    start += len(marker)
    end = html.find('"', start)
    if end == -1:
        return html[start:]
    return html[start:end]
import re
def js_comment_clean(js):
    """Strip HTML comments, non-doctype ``<!...>`` declarations, and
    ``<?...>`` processing instructions from markup.

    Fixes two bugs in the original patterns: a lazy ``[\\s\\S]*?``
    followed by an *optional* ``-->`` terminator only ever consumed the
    literal ``<!--``, and two patterns began with ``|`` (an empty
    alternative), which matches the empty string at every position and
    made those substitutions no-ops.
    """
    # Complete <!-- ... --> comments.
    js = re.sub(r"<!--[\s\S]*?-->", "", js)
    # Unterminated comment: drop everything from <!-- to end of input.
    js = re.sub(r"<!--[\s\S]*$", "", js)
    # Other <!...> declarations, but keep <!DOCTYPE and <![CDATA[.
    js = re.sub(r"<!(?![dD][oO][cC][tT][yY][pP][eE]|\[CDATA\[)[^>]*>?", "", js)
    # Processing instructions such as <?php ... ?>.
    js = re.sub(r"<[?][^>]*>?", "", js)
    return js
def get_stats_from_profile(profileName):
    """Scan a user's profile comment pages for a stats post.

    Walks the paginated profile comment list, and for each promising
    snippet (dig_further) fetches the full comment and checks it with
    is_stats().  Returns {'comment': <bs4 element>, 'url': <page url>}
    for the first hit, or None when nothing is found or any step fails.
    """
    try:
        url = "https://talk.collegeconfidential.com/profile/comments/{}".format(profileName)
        pages = None
        while True:
            res = grabSite(url)
            page = bs4.BeautifulSoup(res.text, 'lxml')
            if pages is None:
                # The profile lists 20 comments per page; derive the
                # remaining page numbers (2..N) from the total post
                # count.  list() keeps this working on Python 3, where
                # range objects have no .pop().
                total_posts = int(page.select(".Posts b")[0].getText())
                pages = list(range(2, len(range(0, total_posts, 20))))
            for item in page.select(".Item"):
                for val in item.select(".Message"):
                    if dig_further(val.getText()):
                        # Promising snippet: fetch the full comment and
                        # confirm it really contains stats.
                        urlVal = extract_url_from_item(item)
                        comment = get_specific_comment(urlVal)
                        if comment is not None and is_stats(comment.getText()):
                            return {'comment': comment, 'url': url}
            if not pages:
                return None
            url = "https://talk.collegeconfidential.com/profile/comments/{}?page=p{}".format(profileName, pages.pop(0))
    except Exception:
        # Best-effort scraper: any network/parse failure means "no stats".
        return None
def get_yearly_threads(url):
    """Collect discussion-thread links from a forum index page.

    A table row qualifies when its title contains "12" and at least one
    KEYWORDS token (case-insensitive); the first collegeconfidential
    anchor in the row is taken.  Rows without a .DiscussionName cell
    (e.g. header rows) are skipped instead of raising IndexError as the
    original did.
    """
    threads = []
    res = grabSite(url)
    page = bs4.BeautifulSoup(res.text, 'lxml')
    for row in page.select("tr"):
        names = row.select(".DiscussionName")
        if not names:
            continue  # header/spacer rows have no discussion title
        title = str(names[0].getText())
        # NOTE(review): the "12" filter looks like a site-specific title
        # heuristic -- confirm its intent before widening it.
        if "12" not in title:
            continue
        if not any(keyword in title.lower() for keyword in KEYWORDS):
            continue
        for anchor in row.find_all('a', href=True):
            if 'talk.collegeconfidential.com' in str(anchor['href']):
                threads.append(anchor['href'])
                break
    return threads
def gen_thread_url(url, num):
    """Build the URL for page *num* of a thread:
    '.../thread.html' -> '.../thread-p3.html'.
    """
    base = url.partition(".html")[0]
    return "{}-p{}.html".format(base, num)
def _classify_decision(text):
    """Map a comment's text to 'accepted', 'rejected', or 'unknown'.

    'accepted' wins when both words appear, matching the branch order of
    the original inline classification (which was duplicated verbatim in
    two places).
    """
    lowered = str(text).lower()
    if 'accepted' in lowered:
        return "accepted"
    if 'rejected' in lowered or 'rejection' in lowered:
        return "rejected"
    return "unknown"


def extract_from_thread_url(threadName, url):
    """Scrape every page of one thread, appending classified stats posts
    to DB[threadName] and a per-thread tally to ALL.

    Runs on a worker thread (see Search); the list.append / dict-value
    writes here are single bytecode ops under CPython's GIL.
    Returns {'url', 'rCount', 'aCount'} for this thread.
    """
    rCount = 0
    aCount = 0
    tempCount = get_page_count(url)
    for i in range(1, tempCount + 1):
        res = grabSite(gen_thread_url(url, i))
        page = bs4.BeautifulSoup(res.text, 'lxml')
        for thread in page.select(".Role_RegisteredUser"):
            comment = thread.select(".userContent")[0]
            # The poster's username is the path segment after /profile/.
            username = str(thread).partition("/profile/")[2].partition('"')[0]
            text = str(comment.getText())
            if is_stats(text):
                # Stats posted directly in the thread.
                typeVal = _classify_decision(text)
                DB[threadName][typeVal].append({'urls': [url], 'type': "direct", "comment": str(thread)})
            elif ('accepted' in text.lower().split(" ")[:5] or 'rejected' in text.lower().split(" ")[:5]):
                # A decision is announced in the first few words, but the
                # stats may live on the poster's profile page instead.
                fullComment = get_stats_from_profile(username)
                if fullComment != None:
                    typeVal = _classify_decision(fullComment['comment'].getText())
                    DB[threadName][typeVal].append({'urls': [url, str(fullComment['url'])], 'type': "profile", "comment": str(fullComment['comment'])})
            # Tallies count occurrences in the raw HTML, as the original did.
            rCount += str(comment).lower().count("rejected")
            aCount += str(comment).lower().count("accepted")
    summary = {"url": url, "rCount": rCount, "aCount": aCount}
    ALL.append(summary)
    return summary
class Decision(object):
    """Placeholder record for a single applicant's decision post."""

    def __init__(self, username):
        # Forum username the decision belongs to.
        self.username = username
#.CountComments .Number
class Search(object):
    """Drive a full scrape of one forum section.

    On construction: ensures the section's entry exists in DB, pages
    through the section index collecting candidate decision threads,
    scrapes each candidate on its own worker thread, then persists DB to
    all.json.  The original mixed ``print x`` statements with ``print(x)``
    calls; the call form (valid in both Python 2 and 3 for a single
    argument) is used consistently here.
    """

    def __init__(self, urlVal):
        self.main_url = urlVal
        # Section slug, e.g. 'columbia-school-general-studies'.
        self.thread = urlVal.partition(".com/")[2].partition("/")[0]
        if self.thread not in DB:
            DB[self.thread] = {'accepted': [], 'rejected': [], 'unknown': []}
        print("Searching for {}".format(self.thread))
        self.pages = get_page_count(self.main_url)
        print(self.pages)
        self.all_threads = []
        for i in range(1, self.pages + 1):
            # NOTE(review): '//p{}' produces a double slash; the forum
            # appears to tolerate it -- confirm before changing.
            for v in get_yearly_threads(self.main_url + "//p{}".format(i)):
                self.all_threads.append(v)
        # One worker thread per candidate thread URL.
        threads = [threading.Thread(target=extract_from_thread_url, args=(self.thread, ar,)) for ar in self.all_threads]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        for val in ALL:
            print(val)
        # Persist results so later runs (and search_all) can reuse them.
        with open('all.json', 'w') as outfile:
            json.dump(DB, outfile, indent=4)
def search_all(thread, filterVal=None):
    """Run a Search over *thread* and return (results, request_count).

    *results* is the section's DB entry reloaded from all.json, filtered
    when *filterVal* is given: 'freshman' drops entries mentioning
    'transfer'; any other value keeps only entries containing it
    (case-insensitive).  The original duplicated the remove-list loop for
    both filter modes; here a single predicate drives one filtering pass.
    """
    THREADS.append(thread)
    if THREADS[-1] not in SEARCH_COUNT:
        SEARCH_COUNT[THREADS[-1]] = 0
    Search(thread)
    # Re-read the freshly persisted results for this section slug.
    x = json.load(open("all.json"))[thread.partition(".com/")[2].partition("/")[0]]
    if filterVal is not None:
        if filterVal.lower() == 'freshman':
            # Freshman view: exclude anything that mentions transfers.
            keep = lambda val: 'transfer' not in str(val).lower()
        else:
            # Generic substring filter.
            keep = lambda val: filterVal.lower() in str(val).lower()
        for v in x.keys():
            x[v] = [val for val in x[v] if keep(val)]
    return x, SEARCH_COUNT[thread]
if __name__ == '__main__':
    #thread = raw_input("College Confidential Thread URL: ")
    # NOTE(review): the first assignment is dead code -- it is immediately
    # overwritten by the Columbia GS section URL below.
    thread = "https://talk.collegeconfidential.com/university-southern-california/"
    thread = "https://talk.collegeconfidential.com/columbia-school-general-studies/"
    # IE: https://talk.collegeconfidential.com/columbia-school-general-studies/
    # Search does all the work in its constructor (scrape + persist).
    cc = Search(thread)