forked from NYUAD-Hackathon-2018/Tataoua
-
Notifications
You must be signed in to change notification settings - Fork 0
/
parsing.py
51 lines (36 loc) · 3.24 KB
/
parsing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
from bs4 import BeautifulSoup
import wget
from pyppeteer import launch
import asyncio
# <p class="searchitem_desc ellipsis_vert">
# filename = wget.download("https://www.volunteermatch.org/search/index.jsp#k=&v=false&s=1&o=distanceBand&l=United+States&r=20&sk=&specialGroupsData.groupSize=&na=&partner=&usafc=")
def _search_url(k):
    """Return the search-results URL requested for loop index *k*.

    The ``s=`` query value is ``str(k) + "1"`` (11, 21, 31, ...), exactly as
    the original script built it -- presumably the 1-based offset of the first
    result on the page; confirm against the site before changing it.
    """
    return (
        "https://www.volunteermatch.org/search/?aff=&includeOnGoing=true"
        "&r=country&l=United+States&o=distanceBand&s={offset}%20search_pages"
        "#k=&v=false&s={offset}&o=distanceBand&l=New+York%2C+NY%2C+USA&r=20"
        "&sk=&specialGroupsData.groupSize=&na=&partner=&usafc="
    ).format(offset=str(k) + "1")


def _record_opportunity(link):
    """Download one opportunity page and append its labelled line to output.txt.

    *link* is the ``<a link_type="opp">`` tag found on a search page. Its text
    becomes the example text; every ``<img class="sprite_profile">`` title on
    the downloaded page becomes a ``__label__<title-without-spaces>`` prefix.
    """
    href = link.get("href")
    if not href:
        # Nothing to download; the original would have raised KeyError here.
        return

    # NOTE(review): "files" is passed through to wget unchanged; whether it is
    # treated as a directory or a file name depends on what exists on disk.
    filename = wget.download("https://www.volunteermatch.org" + href, "files")

    # Hand BeautifulSoup the raw bytes so it detects the page encoding itself
    # instead of depending on the platform default text encoding.
    with open(filename, "rb") as inner_file:
        detail = BeautifulSoup(inner_file, 'html.parser')

    tags = []
    for img in detail.find_all("img", {"class": "sprite_profile"}):
        title = img.get("title")
        if title is None:
            continue  # icon without a title carries no label
        print(title)
        tags.append(title)

    labels = "".join("__label__" + tag.replace(" ", "") + " " for tag in tags)
    # Context manager guarantees the handle is released even if write() fails.
    with open("output.txt", "a", encoding="utf-8") as output_file:
        output_file.write(labels + link.text + "\n")


async def main(start=1, stop=200):
    """Scrape search pages and append one labelled line per opportunity.

    For every ``k`` in ``range(start, stop)`` the rendered search page is
    loaded in a headless browser, each ``<a link_type="opp">`` link is
    downloaded with ``wget`` and a line of the form
    ``__label__A __label__B <link text>`` is appended to ``output.txt``.

    Args:
        start: first loop index (inclusive); defaults to the original 1.
        stop: last loop index (exclusive); defaults to the original 200.
    """
    page_indexes = range(start, stop)
    if not page_indexes:
        # Do not spawn a browser when there is nothing to fetch.
        return

    # One browser for the whole run; the original launched a new one per
    # iteration and never closed any of them.
    browser = await launch()
    try:
        for k in page_indexes:
            url = _search_url(k)
            page = await browser.newPage()
            try:
                await page.goto(url)
                # Fixed 2 s pause before reading the DOM, as in the original
                # (presumably to let client-side rendering finish).
                await page.waitFor(2000)
                html = await page.content()
            finally:
                await page.close()
            print(url)

            soup = BeautifulSoup(html, 'html.parser')
            for link in soup.find_all("a", {"link_type": "opp"}):
                _record_opportunity(link)
    finally:
        await browser.close()
# Script entry: run the scraper coroutine to completion on the default event loop.
_event_loop = asyncio.get_event_loop()
_event_loop.run_until_complete(main())
# https://www.volunteermatch.org/search/?aff=&includeOnGoing=true&r=country&l=United+States&o=distanceBand&s=21%20search_pages#k=&v=false&s=31&o=distanceBand&l=New+York%2C+NY%2C+USA&r=20&sk=&specialGroupsData.groupSize=&na=&partner=&usafc=
# https://www.volunteermatch.org/search/?aff=&includeOnGoing=true&r=country&l=United+States&o=distanceBand&s=21%20search_pages#k=&v=false&s=21&o=distanceBand&l=New+York%2C+NY%2C+USA&r=20&sk=&specialGroupsData.groupSize=&na=&partner=&usafc=
# https://www.volunteermatch.org/search/?aff=&includeOnGoing=true&r=country&l=United+States&o=distanceBand&s=21%20search_pages#k=&v=false&s=11&o=distanceBand&l=New+York%2C+NY%2C+USA&r=20&sk=&specialGroupsData.groupSize=&na=&partner=&usafc=
# https://www.volunteermatch.org/search/?aff=&includeOnGoing=true&r=country&l=United+States&o=distanceBand&s=11 search_pages
# original : https://www.volunteermatch.org/search/index.jsp#k=&v=false&s=1&o=distanceBand&l=United+States&r=20&sk=&specialGroupsData.groupSize=&na=&partner=&usafc=