# app.py
from flask import Flask, request, jsonify
from flask_cors import CORS
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import os
import shutil
import time
import numpy as np
import pandas as pd
from PIL import Image
from keras.applications.resnet50 import preprocess_input, ResNet50
from keras.models import Sequential
from keras.layers import GlobalMaxPooling2D
from sklearn.metrics.pairwise import pairwise_distances

app = Flask(__name__)
CORS(app)

# Headless Chrome options so the scraper can run without a display
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')


def scroll_to_bottom(driver):
    """Scroll until the page height stops growing, so lazy-loaded images render."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def clear_images_folder():
    """Empty the scraped_images folder so each scrape starts fresh."""
    folder = 'scraped_images'
    if os.path.exists(folder):
        for filename in os.listdir(folder):
            file_path = os.path.join(folder, filename)
            try:
                if os.path.isfile(file_path) or os.path.islink(file_path):
                    os.unlink(file_path)
                elif os.path.isdir(file_path):
                    shutil.rmtree(file_path)  # handles non-empty subdirectories
            except Exception as e:
                print(f'Failed to delete {file_path}. Reason: {e}')


def scrape_current_page_images(driver, max_images=10):
    """Download up to max_images <img> sources from the current page."""
    clear_images_folder()
    scroll_to_bottom(driver)
    WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, 'body')))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    images = soup.find_all('img')
    folder_path = os.path.join(os.getcwd(), 'scraped_images')
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Created folder at: {folder_path}")
    else:
        print(f"Folder already exists at: {folder_path}")
    downloaded_count = 0
    for index, img in enumerate(images):
        if downloaded_count >= max_images:
            print(f"Reached the maximum limit of {max_images} images.")
            break
        # Lazy-loaded images often keep the real URL in data-src / data-lazy-src
        img_url = img.get('src') or img.get('data-src') or img.get('data-lazy-src')
        if img_url and img_url.startswith(('http://', 'https://')):
            try:
                img_data = requests.get(img_url, timeout=10).content
                with open(os.path.join(folder_path, f'image_{index + 1}.jpg'), 'wb') as handler:
                    handler.write(img_data)
                downloaded_count += 1
                print(f"Downloaded image_{index + 1}.jpg")
            except Exception as e:
                print(f"Failed to download image {img_url}: {e}")


def load_image_from_path(image_path):
    try:
        img = Image.open(image_path).convert('RGB')
        img = img.resize((224, 224))  # Resize image to fit ResNet50 input size
        img_array = np.array(img)
        return img, img_array
    except Exception as e:
        print(f"Error loading image from {image_path}: {e}")
        return None, None


def embeddings(model, img_array):
    """Return a flat feature vector for one image (a batch of 1 through the CNN)."""
    x = np.expand_dims(img_array, axis=0)
    x = preprocess_input(x)
    return model.predict(x).reshape(-1)


def calculate_cosine_similarity(feature_vector, df_embs):
    feature_vector = feature_vector.reshape(1, -1)
    # Cosine similarity = 1 - cosine distance
    cosine_similarity = 1 - pairwise_distances(df_embs, feature_vector, metric='cosine').flatten()
    return cosine_similarity
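
# Example: for u = [1, 0] and v = [0.6, 0.8] (both unit length),
# cosine distance is 1 - (u . v) / (|u| |v|) = 1 - 0.6 = 0.4,
# so the similarity returned above is 0.6.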


def get_recommendations(feature_vector, df_embs, top_n=5):
    # Calculate cosine similarity scores
    sim_scores = calculate_cosine_similarity(feature_vector, df_embs)
    sim_scores = sim_scores.flatten()
    # Sort the similarity scores in descending order and keep the top_n
    sim_scores_indices = np.argsort(sim_scores)[::-1]
    idx_rec = sim_scores_indices[:top_n]
    idx_sim = sim_scores[idx_rec]
    return idx_rec, idx_sim
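
# Worked example: sim_scores = [0.2, 0.9, 0.5] with top_n=2 gives
# idx_rec = [1, 2] (indices of the two highest scores) and idx_sim = [0.9, 0.5].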


def recommender_folder(folder_path, df, df_embs, model, top_n=5):
    recommendations = []
    # Iterate through each image in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith((".jpg", ".png")):
            image_path = os.path.join(folder_path, filename)
            img, img_array = load_image_from_path(image_path)
            if img is None:
                continue
            ref_features = embeddings(model, img_array)
            idx_rec, idx_sim = get_recommendations(ref_features, df_embs, top_n=top_n)
            # Collect the top-n catalog matches for this image
            for row_idx in idx_rec:
                recommendations.append({
                    "title": df.iloc[row_idx]['title'],
                    "link": df.iloc[row_idx]['link']
                })
    print(recommendations)
    return recommendations


@app.route('/scrape', methods=['POST'])
def scrape():
    data = request.get_json()
    url = data['url']
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()), options=options)
    try:
        driver.get(url)
        MAX_IMAGES_TO_DOWNLOAD = 10
        scrape_current_page_images(driver, max_images=MAX_IMAGES_TO_DOWNLOAD)
    finally:
        driver.quit()
    df = pd.read_csv("recommendation/processed_myntraDataset.csv")
    df_embs = pd.read_csv('recommendation/image_embeddings.csv', header=None).values
    # Initialize ResNet50 as a frozen feature extractor
    base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
    base_model.trainable = False
    model = Sequential([
        base_model,
        GlobalMaxPooling2D()
    ])
    folder_path = './scraped_images'
    # Get recommendations
    recommendations = recommender_folder(folder_path, df, df_embs, model, top_n=1)
    return jsonify(recommendations)


if __name__ == '__main__':
    app.run(debug=True)
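
# A minimal client sketch for exercising the endpoint. The port below is
# Flask's default; the target URL is an illustrative assumption, not part
# of this app:
#
#   import requests
#   resp = requests.post("http://127.0.0.1:5000/scrape",
#                        json={"url": "https://example.com/products"})
#   print(resp.json())  # e.g. [{"title": "...", "link": "..."}, ...]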