-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgoogleDownloader.py
61 lines (52 loc) · 1.72 KB
/
googleDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/python3
#
# Script to download .xls spreadsheets found via a Google web search
#
import optparse
import sys
import requests
import re
import urllib.request
from bs4 import BeautifulSoup
from multiprocessing import Queue
import threading
import logging
import ssl
class Downloader(threading.Thread):
    """Worker thread that downloads files described by items from a queue.

    Each queue item is a ``(download_url, save_as)`` pair.  An item whose
    URL is falsy (for example ``(None, None)``) is a sentinel that tells
    the worker to stop.
    """

    def __init__(self, queue):
        """Remember the queue this worker consumes.

        Args:
            queue: Object whose ``get()`` returns ``(download_url, save_as)``
                pairs.
        """
        super().__init__()
        self.queue = queue

    def run(self):
        """Download queued URLs until a sentinel item is received."""
        while True:
            # Read from the queue handed to this instance rather than a
            # module-level global, so the worker is usable with any queue.
            download_url, save_as = self.queue.get()
            # Sentinel: a falsy URL means there is no more work.
            if not download_url:
                return
            try:
                urllib.request.urlretrieve(download_url, filename=save_as)
            except Exception as e:
                # Best effort: one failed download must not kill the worker,
                # so log it and move on to the next item.
                logging.warning("error downloading %s: %s", download_url, e)
def main():
    """Search Google for .xls spreadsheets and queue every hit for download.

    Fetches one Google results page, starts the worker threads, pushes a
    ``(url, filename)`` pair onto the queue for each result link whose
    target ends in ``xls``, then sends one stop sentinel per worker and
    waits for the workers to finish.

    Relies on the module-level names ``queue`` and ``threads`` having been
    created before this function is called.
    """
    worker_count = 5

    # SECURITY NOTE: this disables HTTPS certificate verification for every
    # urllib request made by this process.  It is kept because the original
    # script depends on it, but downloads are then open to tampering.
    ssl._create_default_https_context = ssl._create_unverified_context

    # Query parameters: q = search terms, num = result count, start = offset.
    # A timeout keeps the script from hanging forever on a stalled connection.
    page = requests.get(
        "http://google.com/search?q=species+latitude+longitude+sample%20filetype%3Axls&num=300&start=1",
        timeout=30,
    )
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(page.content, "html.parser")

    workers = []
    for _ in range(worker_count):
        worker = Downloader(queue)
        threads.append(worker)
        workers.append(worker)
        worker.start()

    # Result links look like "/url?q=<target>&<tracking params>".
    result_href = re.compile(r"(?<=/url\?q=)(htt.*://.*)")
    embedded_url_boundary = re.compile(r":(?=http)")

    for link in soup.find_all("a", href=result_href):
        target = link["href"].replace("/url?q=", "")
        # Keep only the first URL if several are glued together, then drop
        # the tracking parameters appended after the first "&".
        url = embedded_url_boundary.split(target)[0].split("&")[0]
        if url.endswith("xls"):
            filename = url.split("/")[-1]
            print(filename)
            queue.put((url, filename))

    # One sentinel per worker so that every thread leaves its loop.
    for _ in range(worker_count):
        queue.put((None, None))

    # Wait for outstanding downloads before returning.
    for worker in workers:
        worker.join()
if __name__ == "__main__":
    # Module-level state that main() reads by name: the list of started
    # worker threads and the work queue they are fed from.
    threads = list()
    queue = Queue()
    main()