-
Notifications
You must be signed in to change notification settings - Fork 0
/
gscrap.py
85 lines (61 loc) · 2.6 KB
/
gscrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
## IMPORTING ALL THE LIBRARIES NEEDED
import os
import io
import time
import wget
import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
# Initializing Chrome Web Driver and its Configuration
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# Chrome's window-size switch takes "width,height"; the comma-separated form
# with the leading "--" is the documented spelling.
options.add_argument('--window-size=1920,1080')
options.add_argument('--ignore-certificate-errors')
options.add_argument('--ignore-ssl-errors')
options.add_argument('--disable-blink-features=AutomationControlled')
# The driver path is no longer passed positionally: recent Selenium 4 releases
# removed that argument, while older releases already default to looking up
# "chromedriver" on PATH, so this call works on both.
browser = webdriver.Chrome(options=options)
## This function automates scrolling and clicking on thumbnails with selenium,
## finds the high resolution image behind each one and returns the urls
def getimg_FROMGOOGLE(wd, delay, max_imgs):
    """Collect image URLs from a Google Images results page.

    Args:
        wd: Selenium WebDriver used to load the page and interact with it.
        delay: Seconds to pause after every scroll and every thumbnail click
            so the page has time to load new content.
        max_imgs: Maximum number of distinct image URLs to collect.

    Returns:
        A set of image URL strings. It holds fewer than ``max_imgs`` entries
        when the page stops yielding new thumbnails.
    """
    ## this function scrolls to the end of the page in your browser
    def scroll_down(wd):
        wd.execute_script("window.scrollTo(0,document.body.scrollHeight);")
        time.sleep(delay)

    url = "https://www.google.com/search?q=african+traditional+footwears&sxsrf=ALiCzsaHkM4jvudfOWkTgfPXnAnPvEf0jA:1656342359384&source=lnms&tbm=isch&sa=X&ved=2ahUKEwieidSP9M34AhVgRvEDHfS-B_wQ_AUoAXoECAEQAw&biw=1366&bih=663&dpr=1"
    wd.get(url)

    img_urls = set()
    # Number of thumbnails already visited; every thumbnail is clicked at most
    # once, whether or not it produced a new URL.
    clicked = 0
    while len(img_urls) < max_imgs:
        scroll_down(wd)
        thumbnails = wd.find_elements(By.CLASS_NAME, "Q4LuWd")  # YOU JUST NEED TO FIND THE CLASS NAME THAT MATCHES YOURS
        pending = thumbnails[clicked:]
        if not pending:
            # Scrolling revealed nothing new (end of results, or the class
            # name no longer matches): stop instead of looping forever.
            break
        for thumb in pending:
            if len(img_urls) >= max_imgs:
                break
            clicked += 1
            try:
                thumb.click()
            except Exception:
                # The thumbnail could not be clicked; move on to the next one.
                continue
            time.sleep(delay)
            previews = wd.find_elements(By.CLASS_NAME, "n3VNCb")  # FIND THE CLASS NAME THAT MATCHES YOURS
            for image in previews:
                src = image.get_attribute('src')
                if src in img_urls:
                    # Already collected: nothing new behind this thumbnail.
                    break
                if src and 'http' in src and len(img_urls) < max_imgs:
                    img_urls.add(src)
                    print(f"FOUND: {len(img_urls)} ")
    return img_urls
path = "/content/drive/MyDrive"  # Path you want to save your image
try:
    urls = getimg_FROMGOOGLE(browser, 1, 8)  # calling the function
finally:
    # Always shut the browser down so no Chrome/chromedriver process is left
    # running once scraping has finished or failed.
    browser.quit()
# Download every collected url and save it under `path`; the running index
# keeps file names unique.
for counter, image in enumerate(urls):
    # The url fragment may hold characters that are not valid in a file name
    # (such as "/", "?" or "="), so only letters and digits are kept.
    keyword = "".join(ch for ch in image[-10:-6] if ch.isalnum())
    save_as = os.path.join(path, keyword + str(counter) + '.jpg')
    wget.download(image, save_as)