image_extractor.py

# IMAGE EXTRACTOR
# A simple tool for searching and downloading images from Bing image search.
# Requires Python 3.7+
import json
import os
import random
from string import ascii_letters

import requests
from bs4 import BeautifulSoup


def generate_random_sequence() -> str:
    """Generates a sequence of random letters and numbers,
    to be used in naming images later.

    Returns
    -------
    name: str
        A name made up of 10 random letters followed by 5 random
        integers (each in the range 0-100).
        example: "RbCsfpoxqM8686241812"
    """
    name = ""
    # Add 10 random letters to the string.
    for _ in range(10):
        name += random.choice(ascii_letters)
    # Append 5 random integers (0-100) to the string.
    for _ in range(5):
        name += str(random.randint(0, 100))
    return name
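
# A minimal sanity check (hypothetical output; the exact name is random,
# and each appended integer may be 1-3 digits long):
#   >>> generate_random_sequence()
#   'kQzWvPbnRa57083'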


def write_image_from_link(link, output_folder="Results") -> int:
    """Downloads the image from link and writes it to a JPEG file with a
    random name generated by generate_random_sequence().

    Parameters
    ----------
    link: str
        URL of the image.
        example: "https://media.sciencephoto.com/image/e1300006/800wm/E1300006-Avalanche.jpg"
    output_folder: str (DEFAULT="Results")
        Output folder where images will be downloaded to.
        example: Results

    Returns
    -------
    :int
        0 if the download and write operation is successful,
        1 if the download and write operation failed.
    """
    # If the folder doesn't exist, create it.
    if not os.path.isdir(output_folder):
        os.mkdir(output_folder)
        print(f"[+] Creating directory {output_folder}")
    # Generate a random file name and add a .jpeg extension.
    img_f_name = generate_random_sequence() + ".jpeg"
    img_path = os.path.join(output_folder, img_f_name)
    # Fetch the image data with a GET request; treat network errors
    # the same as a bad status code so one dead link can't crash the run.
    try:
        response = requests.get(link, timeout=30)
    except requests.exceptions.RequestException:
        print(f"CONNECTION ERROR: Failed to download from {link}")
        return 1
    if response.status_code != 200:
        print(f"CONNECTION ERROR: Failed to download from {link}")
        return 1
    # Write the binary image data, closing the file when done.
    with open(img_path, "wb") as fp:
        fp.write(response.content)
    return 0
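
# A minimal usage sketch (the URL below is the docstring's example and may
# no longer resolve):
#   status = write_image_from_link(
#       "https://media.sciencephoto.com/image/e1300006/800wm/E1300006-Avalanche.jpg",
#       output_folder="Results",
#   )
#   # status == 0 on success, 1 on failure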


def extract_links_from_soup(soup) -> list:
    """Extracts image links from soup.

    Parameters
    ----------
    soup: BeautifulSoup
        BeautifulSoup object from which the links are extracted.

    Returns
    -------
    links: list
        List of image links.
    """
    links = []
    # Bing wraps each result in an <a class="iusc"> tag whose "m" attribute
    # holds a JSON blob; its "murl" key is the full-resolution image URL.
    image_elems = soup.find_all("a", {"class": "iusc"})
    for elem in image_elems:
        # Parse the JSON metadata safely instead of eval()-ing it.
        image_info = json.loads(elem["m"])
        links.append(image_info["murl"])
    return links
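
# For reference, the "m" attribute parsed above looks roughly like this
# (an illustrative sketch; Bing's actual payload carries more keys):
#   <a class="iusc" m='{"murl": "https://example.com/full.jpg", ...}'>
#   json.loads(elem["m"])["murl"]  ->  "https://example.com/full.jpg"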


def intro():
    print("$" * 37)
    print("$" * 10, "IMAGE EXTRACTOR", "$" * 10)
    print("$" * 37)
    print("\nAUTHOR: SHAIKH AQUIB\n")


def outro():
    print("Bye :)")


def main():
    """Main routine."""
    # Spoofing the user agent is necessary; otherwise Bing might reject requests.
    USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:65.0) Gecko/20100101 Firefox/65.0"
    request_header = {"user-agent": USER_AGENT}
    intro()
    # Start the main loop.
    while True:
        print("*** enter q to quit ***")
        term = input("Enter the search term: ")
        if term == "q":
            outro()
            break
        img_count = int(input("How many images do you want? (1-35): "))
        response = requests.get(
            url=f"https://www.bing.com/images/search?q={term}&safeSearch=off",
            headers=request_header,
            timeout=30,
        )
        # If the request is not successful, skip this term.
        if response.status_code != 200:
            print(f"[!] Error: request for {term} returned with {response.status_code}")
            continue
        # Parse the HTML source.
        soup = BeautifulSoup(response.content, "html.parser")
        image_urls = extract_links_from_soup(soup)
        total_images = len(image_urls)
        for i in range(img_count):
            # Stop once every link extracted from the page has been used.
            if i >= total_images:
                break
            print(f"[+] Extracting image #{i + 1}")
            if write_image_from_link(image_urls[i]) == 1:
                print("[!] Failed")


if __name__ == "__main__":
    main()
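
# Example session (illustrative; actual downloads depend on Bing's response):
#   $ python image_extractor.py
#   ...banner...
#   *** enter q to quit ***
#   Enter the search term: avalanche
#   How many images do you want? (1-35): 3
#   [+] Extracting image #1
#   [+] Extracting image #2
#   [+] Extracting image #3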