parse.py
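"""Scrape AngelList (angel.co) company search results into a CSV file.

The script drives the site's own XHR endpoint: a POST to
company_filters/search_data returns company ids plus a hexdigest token,
which are then exchanged for rendered HTML company cards that BeautifulSoup
parses into a pandas DataFrame and appends to all_companies.csv.
"""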
import argparse
import os.path
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup

# AngelList's XHR search endpoint; returns company ids plus a hexdigest token.
JS = "https://angel.co/company_filters/search_data"
HEADERS = {
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
}
# Placeholders, in order: joined ids, total, page, hexdigest.
BASE_URL = ("https://angel.co/companies/startups?ids%5B%5D={}&total={}"
            "&page={}&sort=signal&new=false&hexdigest={}")
DF_COLUMNS = ['name', 'desc', 'website', 'location',
              'employees', 'raised', 'angel_url', 'angel_id']
CSV_FILENAME = 'all_companies.csv'


def parse_companies(companies):
    """Turn a batch of parsed company cards into one DataFrame row each."""
    rows = []
    for idx, company in enumerate(companies):
        if idx % 4 == 0:
            print('parsing company {}'.format(idx))
        name = company.find_all("a", {"class": "startup-link"})[1].text
        description = company.find_all("div", {"class": "pitch"})[0].text.strip('\n')
        if len(description) == 0:
            description = '-'
        company_column = company.find_all("div", {"class": "company column"})[0]
        angel_list_url = company_column.find_all('a', href=True)[0]['href']
        location_tag = company.find_all("div", {"class": "location"})[0]
        location = location_tag.find_all("div", {"class": "value"})[0].text.strip('\n')
        employees_tag = company.find_all("div", {"class": "company_size"})[0]
        employees = employees_tag.find_all("div", {"class": "value"})[0].text.strip('\n')
        raised_tag = company.find_all("div", {"class": "raised"})[0]
        raised = raised_tag.find_all("div", {"class": "value"})[0].text.strip('\n')
        website_tag = company.find_all("div", {"class": "website"})[0]
        a = website_tag.find_all('a', href=True)
        website = a[0]['href'] if a else '-'
        angel_id = company.find_all("a", {"class": "startup-link"})[0]['data-id']
        # The value order must match DF_COLUMNS exactly:
        # name, desc, website, location, employees, raised, angel_url, angel_id.
        rows.append([name, description, website, location,
                     employees, raised, angel_list_url, angel_id])
    # Build the frame once; DataFrame.append was removed in pandas 2.0.
    return pd.DataFrame(rows, columns=DF_COLUMNS)
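

# The repeated find_all(...)[0].text.strip('\n') pattern above could be pulled
# into a small helper. A minimal sketch, not used by the original code; the
# class names mirror the ones queried in parse_companies.
def field_value(company, css_class, default='-'):
    # Return the text of the first div.value inside div.<css_class>, or the
    # default when the card lacks that field entirely.
    blocks = company.find_all("div", {"class": css_class})
    if not blocks:
        return default
    values = blocks[0].find_all("div", {"class": "value"})
    return values[0].text.strip('\n') if values else default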


def get_next_pages(search_query='', start_page=1):
    """Yield batches of company cards, one page of search results at a time."""
    def fetch_companies(session, params):
        # Exchange the ids + hexdigest from the search endpoint for rendered
        # HTML cards. Note that BASE_URL expects total before page.
        listing = session.get(BASE_URL.format("&ids%5B%5D=".join(map(str, params["ids"])),
                                              params["total"],
                                              params["page"],
                                              params["hexdigest"]),
                              headers=HEADERS)
        soup = BeautifulSoup(listing.json()["html"], "html.parser")
        return soup.find_all(name="div", attrs={"class": "base startup"})

    search_data = {"sort": "signal", "page": start_page,
                   'filter_data[markets][]': search_query}
    with requests.Session() as s:
        params = s.post(JS, data=search_data, headers=HEADERS).json()
        yield fetch_companies(s, params)
        while True:
            # Increment the page count from the previous response and keep the
            # market filter, so pagination stays within the same search.
            search_data["page"] = params["page"] + 1
            params = s.post(JS, data=search_data, headers=HEADERS).json()
            # Keep going until the endpoint stops returning ids.
            if "ids" not in params:
                break
            # Don't hammer the server with requests.
            time.sleep(.3)
            yield fetch_companies(s, params)
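

# Example usage of the generator (hypothetical query): fetch only the first
# batch of cards instead of paginating through every page of results.
#
#     first_batch = next(get_next_pages(search_query='fintech'))
#     print('{} companies in the first batch'.format(len(first_batch)))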


def create_csv():
    # Write an empty CSV containing only the header row.
    df = pd.DataFrame(columns=DF_COLUMNS)
    df.to_csv(CSV_FILENAME, index=None)


def add_parsed_companies_to_all(parsed_df):
    """Merge a parsed batch into the on-disk CSV, de-duplicating rows."""
    if not os.path.isfile(CSV_FILENAME):
        create_csv()
    all_companies = pd.read_csv(CSV_FILENAME, index_col='name')
    parsed_df = parsed_df.set_index('name')
    all_companies = pd.concat([all_companies, parsed_df])
    # drop_duplicates() compares column values only, so re-running the scraper
    # does not duplicate companies whose details are unchanged.
    unique_companies = all_companies.drop_duplicates()
    unique_companies.to_csv(CSV_FILENAME)
    print('data has been written')
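

# Example (hypothetical data): merging a hand-built batch into the CSV.
#
#     batch = pd.DataFrame([['Acme', '-', 'https://acme.example', 'Berlin',
#                            '1-10', '$1M', 'https://angel.co/acme', '42']],
#                          columns=DF_COLUMNS)
#     add_parsed_companies_to_all(batch)   # appends, then de-duplicates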


def start(query):
    companies = get_next_pages(search_query=query)
    df = pd.DataFrame(columns=DF_COLUMNS)
    for idx, comps in enumerate(companies):
        print('batch index {}'.format(idx))
        parsed_companies = parse_companies(comps)
        # DataFrame.append was removed in pandas 2.0; accumulate with concat.
        df = pd.concat([df, parsed_companies])
    add_parsed_companies_to_all(df)


if __name__ == '__main__':
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("-q", "--query", required=False,
                                 help="Search for companies in a specific market")
    args = argument_parser.parse_args()
    start(args.query)
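

# Example invocations (the market tag is illustrative):
#
#     python parse.py -q fintech    # scrape companies matching one market
#     python parse.py               # no filter: scrape the default listing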