-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine.py
336 lines (311 loc) · 15.4 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# IMPORTS
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import mysql.connector
import os
import warnings
from sklearn.metrics import r2_score
import datetime as dt
import json
plt.style.use('seaborn')
warnings.simplefilter(action='ignore')
# CONSTANTS
# filepaths for saving and retrieving the encoders
LOCALITY_ENCODER_PATH = 'Objects/Encoders/LabelEncoder/'
LAYOUT_ENCODER_PATH = 'Objects/Encoders/OrdinalEncoder/layout_type/'
PROPERTY_ENCODER_PATH = 'Objects/Encoders/OrdinalEncoder/property_type/'
SELLER_ENCODER_PATH = 'Objects/Encoders/OrdinalEncoder/seller_type/'
FURNITURE_ENCODER_PATH = 'Objects/Encoders/OrdinalEncoder/furniture_encoders/'
# city list
CITIES = ['AHEMDABAD','BANGALORE','CHENNAI','DELHI','HYDERABAD','KOLKATA','MUMBAI','PUNE']
# for plotting
PIE_NROWS = 2
PIE_NCOLS = 2
NUMERICAL_NROWS = 2
NUMERICAL_NCOLS = 2
REGULAR_FIGSIZE = (16,6)
PIE_FIGSIZE=(20,15)
MULTICOL_FIGSIZE=(16,8)
# for saving plots
AFFORDABILITY_PLOT_DESTINATION = 'static/Affordability/'
OTHER_PLOT_DESTINATION = 'static/Other/'
NUMERICAL_PLOT_DESTINATION = 'static/Numerical/'
SPACIOUS_PLOT_DESTINATION = 'static/Spacious/'
OVERALL_NUMERICAL_ANALYSIS_PATH = 'static/Overall'
# for saving models
RESULTS_SAVE_PATH = 'Results/'
MODEL_SAVE_PATH = 'Objects/Models/'
EVALUATION_FIGSIZE = (16,6)
EVALUATION_NROWS = 1
EVALUATION_NCOLS = 2
TEST_SIZE = 0.3
RANDOM_STATE = 42
# Initializing dictionaries
clean_df_dict = {}
preprocessed_df_dict = {}
X_train_dict = {}
y_train_dict = {}
# getting the config file
with open('config.json', 'r') as f:
config = json.load(f)
# connecting to mysql
conn = mysql.connector.connect(
host=config.get('host'),
port=config.get('port'),
user=config.get('user'),
password=config.get('password'),
database=config.get('database'),
auth_plugin=config.get('auth_plugin')
)
# reading the data and storing it in dictionaries
df_dict = dict(zip(CITIES, [pd.read_sql(f'SELECT * FROM {city}', con=conn) for city in CITIES]))
# closing the connection
conn.close()
# for storing encoders while preprocessing
locality_encoder_dict = {}
furnish_type_encoder_dict = {}
seller_type_encoder_dict = {}
layout_type_encoder_dict = {}
property_type_encoder_dict = {}
# for storing encoders while preprocessing
locality_encoder_dict = {}
furnish_type_encoder_dict = {}
seller_type_encoder_dict = {}
layout_type_encoder_dict = {}
property_type_encoder_dict = {}
def preprocess(df_dict):
"""
Cleans and preprocesses the data for future use
Cleaning:
- Removes duplicate rows (if they exist)
- Removes the city column as the dataframes are already separated
- Removes outliers on the basis of price column using lower limit as (Q1-1.5*IQR) and
upper limit as (Q3+1.5*IQR)
Preprocessing:
- Uses label encoding for locality column as order does not matter in locality
- Uses ordianal encoding for furnish_type, layout_type, seller_type and property_type as the
order matters in these columns for predicting the rent prices
Args:
df_dict - the dictionary containing cities as keys and the dataframe corresponding to them as values
Returns:
None
"""
for city, df in df_dict.items():
# Cleaning the data
# Renaming the columns
df.drop(['ID'],axis=1, inplace=True)
df.columns = ['SELLER TYPE','BEDROOM','LAYOUT TYPE','PROPERTY TYPE','LOCALITY','PRICE','AREA','FURNISH TYPE','BATHROOM']
# Dropping duplicate rows (if they exist)
df.drop_duplicates(inplace=True)
# Removing outliers
desc = df['PRICE'].describe()
q1 = desc.loc['25%']
q3 = desc.loc['75%']
iqr = q3-q1
lower_lim = q1-(1.5*iqr)
upper_lim = q3+(1.5*iqr)
df = df[(df['PRICE']>=lower_lim)&(df['PRICE']<=upper_lim)]
# Renaming the columns
cols = ['SELLER TYPE','BEDROOM','LAYOUT TYPE','PROPERTY TYPE','LOCALITY','PRICE','AREA','FURNISH TYPE','BATHROOM']
df.columns = cols
# since the data is cleaned now, we can store it in clean_dict
clean_df = df.copy()
clean_df_dict[city] = clean_df
# Preprocessing the data
locality_encoder = LabelEncoder()
df['LOCALITY'] = locality_encoder.fit_transform(df['LOCALITY'])
locality_encoder_dict[city] = locality_encoder
if not os.path.exists(LOCALITY_ENCODER_PATH):
joblib.dump(locality_encoder)
ordinal_encoder_cols = ['SELLER TYPE','LAYOUT TYPE','PROPERTY TYPE','FURNISH TYPE']
ord_enc_dict = {
'SELLER TYPE':seller_type_encoder_dict,
'LAYOUT TYPE':layout_type_encoder_dict,
'PROPERTY TYPE':property_type_encoder_dict,
'FURNISH TYPE':furnish_type_encoder_dict
}
for col in ordinal_encoder_cols:
cat = [df.groupby(by=[col])['PRICE'].mean().sort_values(ascending=True).index]
col_encoder = OrdinalEncoder(categories=cat)
df[col] = col_encoder.fit_transform(df[[col]])
ord_enc_dict[col][city] = col_encoder
preprocessed_df_dict[city] = df
paths = {
'SELLER TYPE': os.path.join(SELLER_ENCODER_PATH, f'{city}_seller_type_encoder.pkl'),
'LAYOUT TYPE': os.path.join(LAYOUT_ENCODER_PATH,f'{city}_layout_type_encoder.pkl'),
'PROPERTY TYPE': os.path.join(PROPERTY_ENCODER_PATH,f'{city}_property_type_encoder.pkl'),
'FURNISH TYPE': os.path.join(FURNITURE_ENCODER_PATH, f'{city}_furnish_type_encoder.pkl')
}
if not os.path.exists(FURNITURE_ENCODER_PATH): #check if the desired file path exists
os.makedirs(FURNITURE_ENCODER_PATH) #if not then make one
if not os.path.exists(SELLER_ENCODER_PATH):
os.makedirs(SELLER_ENCODER_PATH)
if not os.path.exists(LAYOUT_ENCODER_PATH):
os.makedirs(LAYOUT_ENCODER_PATH)
if not os.path.exists(PROPERTY_ENCODER_PATH):
os.makedirs(PROPERTY_ENCODER_PATH)
for col in ordinal_encoder_cols:
joblib.dump(ord_enc_dict[col][city], paths[col])
preprocess(df_dict)
# EDA
for city, df in clean_df_dict.items():
df['CITY'] = city
df['AFFORDABILITY'] = df['AREA']/df['PRICE']
combined = pd.concat([clean_df_dict[city] for city in CITIES])
# Overall Analysis (for home page)
# Preferred Cities
fig, ax = plt.subplots(figsize=REGULAR_FIGSIZE)
sns.countplot(x=combined['CITY'],ax=ax)
ax.set_xlabel('CITY')
ax.set_ylabel('NUMBER OF HOUSES')
ax.set_title('NUMBER OF HOUSES IN EACH CITY')
if not os.path.exists(OVERALL_NUMERICAL_ANALYSIS_PATH):
os.makedirs(OVERALL_NUMERICAL_ANALYSIS_PATH)
plt.savefig(os.path.join(OVERALL_NUMERICAL_ANALYSIS_PATH, 'n_houses.png'))
# Other columns
overall_numerical_cols = ['PRICE','AREA','AFFORDABILITY']
for col in overall_numerical_cols:
fig, ax = plt.subplots(figsize=REGULAR_FIGSIZE)
mean_df = combined.groupby(by=['CITY'])[col].mean()
mean_df.sort_values(inplace=True,ascending=False)
sns.barplot(x=mean_df.index, y=mean_df, ax=ax)
ax.set_xlabel('CITY')
ax.set_ylabel(f'AVERAGE {col}')
ax.set_title(f'AVERAGE {col} IN EACH CITY')
plt.savefig(os.path.join(OVERALL_NUMERICAL_ANALYSIS_PATH, f'{col}.png'))
# Number of houses in each city
fig, ax = plt.subplots(figsize=REGULAR_FIGSIZE)
sns.countplot(x=combined['CITY'],ax=ax,order=combined['CITY'].value_counts().index)
ax.set_title('NUMBER OF HOUSES IN EACH CITY')
ax.set_xlabel('CITY')
ax.set_ylabel('NUMBER OF HOUSES')
if not os.path.exists(OVERALL_NUMERICAL_ANALYSIS_PATH):
os.makedirs(OVERALL_NUMERICAL_ANALYSIS_PATH)
plt.savefig(os.path.join(OVERALL_NUMERICAL_ANALYSIS_PATH, 'n_houses.png'))
# Overall analysis
overall_numerical_cols = ['PRICE','AREA','AFFORDABILITY']
for col in overall_numerical_cols:
fig, ax = plt.subplots(figsize=REGULAR_FIGSIZE)
mean_df = combined.groupby(by=['CITY'])[col].mean()
mean_df.sort_values(inplace=True,ascending=False)
sns.barplot(x=mean_df.index, y=mean_df, ax=ax)
ax.set_xlabel('CITY')
ax.set_ylabel(f'AVERAGE {col}')
ax.set_title(f'AVERAGE {col} IN EACH CITY')
plt.savefig(os.path.join(OVERALL_NUMERICAL_ANALYSIS_PATH, f'{col}.png'))
# Other analysis
pie_cols = np.array(['SELLER TYPE','LAYOUT TYPE','PROPERTY TYPE','FURNISH TYPE']).reshape(PIE_NROWS,PIE_NCOLS)
for city, df in clean_df_dict.items():
fig, ax = plt.subplots(figsize=PIE_FIGSIZE,nrows=PIE_NROWS, ncols=PIE_NCOLS) #create a fig with 2 rows and 2 cols
for i in range(PIE_NROWS): #loop through the rows
for j in range(PIE_NROWS): #loop through columns
ax[i,j].pie(x=df[pie_cols[i,j]].value_counts()) #plot the pie chart
text = pd.DataFrame(df[pie_cols[i,j]].value_counts().apply(lambda x: f'{np.round((x/df.shape[0])*100,2)}%'))
# create the text to display on pie chart
text.index = text.index.str.upper() #convert text to upper case
text = text.to_string() #convert text to string
ax[i,j].text(1,0,text) #display text on pie chart
ax[i,j].set_title(f'{pie_cols[i,j]} IN {city}') #set the title
if not os.path.exists(OTHER_PLOT_DESTINATION): #check if the path exists
os.makedirs(OTHER_PLOT_DESTINATION) #if not then create the path
plt.savefig(os.path.join(OTHER_PLOT_DESTINATION, f'{city}.png')) #save the figure
# Numerical analysis
# create an array for analyzing numerical columns
numerical_cols = np.array([
['PRICE','AREA'],
['BEDROOM','BATHROOM']
]).reshape(NUMERICAL_NROWS,NUMERICAL_NCOLS)
for city, df in clean_df_dict.items():
fig, ax = plt.subplots(figsize=MULTICOL_FIGSIZE,nrows=2,ncols=2) #create a matplotlib figure
for i in range(NUMERICAL_NROWS): #loop through the rows
for j in range(NUMERICAL_NCOLS): #loop through the columns
if i==0: # if it is the 1st row
sns.histplot(df[numerical_cols[i,j]],ax=ax[i,j],kde=True) #plot the distribution of column of ith row and jth column
ax[i,j].set_title(f'DISTRIBUTION OF {numerical_cols[i,j]} IN {city}') # set the title
ax[i,j].set_xlabel(numerical_cols[i,j]) #set the xlabel
ax[i,j].set_ylabel('NUMBER OF HOUSES') #set the ylabel
if i==1: #if it is the second row
sns.countplot(x=df[numerical_cols[i,j]],ax=ax[i,j]) #plot the countplot of column of ith row and jth column
ax[i,j].set_title(f'NUMBER OF {numerical_cols[i,j]} IN THE HOUSES IN {city}') #set the title
ax[i,j].set_xlabel(numerical_cols[i,j]) #set the xlabel
ax[i,j].set_ylabel('NUMBER OF HOUSES') #set the ylabel
plt.tight_layout() #apply tight layout to prevent overlap of columns
if not os.path.exists(NUMERICAL_PLOT_DESTINATION): #check if the path exists
os.makedirs(NUMERICAL_PLOT_DESTINATION) #if not then make the path
plt.savefig(os.path.join(NUMERICAL_PLOT_DESTINATION, f'{city}.png')) #save the figure at the path
# Affordability analysis
for city, df in clean_df_dict.items():
affordable = df.groupby(by=['LOCALITY'])['AFFORDABILITY'].mean() #calculate mean area for each locality
most_affordable = affordable.sort_values(ascending=False)[:10] #sort in ascending order for most spacious
least_affordable = affordable.sort_values(ascending=True)[:10] #sort in descending order for least spacious
fig, ax = plt.subplots(figsize=MULTICOL_FIGSIZE,nrows=1,ncols=2) #create figure with 1 row and 2 cols
sns.barplot(x=least_affordable.index, y=least_affordable, ax=ax[0], order=least_affordable.index[::-1]) #plot least spacious on 1st col
ax[0].set_title(f'LEAST AFFORDABLE LOCALITIES IN {city}') #set title
ax[0].set_xlabel('LOCALITY') #set xlabel
ax[0].set_ylabel('AVERAGE AFFORDABLITY') #set ylabel
ax[0].tick_params(axis='x',labelrotation=90) #rotate the labels on x axis by 90 degrees for readibility
sns.barplot(x=most_affordable.index, y=most_affordable,ax=ax[1],order=most_affordable.index[::-1])#plot least affordable localities in 2nd column
ax[1].set_title(f'MOST AFFORDABLE LOCALITIES IN {city}')
ax[1].set_xlabel('LOCALITY') #set xlabel
ax[1].set_ylabel('AVERAGE AFFORDABLITY') #set ylabel
ax[1].tick_params(axis='x',labelrotation=90) #rotate the labels on x axis by 90 degrees for readibility
plt.tight_layout() #apply tight layout for no overlap
if not os.path.exists(AFFORDABILITY_PLOT_DESTINATION): #check if the path exists
os.makedirs(AFFORDABILITY_PLOT_DESTINATION) #if not, make the path
plt.savefig(os.path.join(AFFORDABILITY_PLOT_DESTINATION, f'{city}.png')) #save the figure
# Area analysis
for city, df in clean_df_dict.items():
spacious = df.groupby(by=['LOCALITY'])['AREA'].mean() #calculate mean area for each locality
most_spacious = spacious.sort_values(ascending=False)[:10] #sort in ascending order for most spacious
least_spacious = spacious.sort_values(ascending=True)[:10] #sort in descending order for least spacious
fig, ax = plt.subplots(figsize=MULTICOL_FIGSIZE,nrows=1,ncols=2) #create figure with 1 row and 2 cols
sns.barplot(x=least_spacious.index, y=least_spacious, ax=ax[0]) #plot least spacious on 1st col
ax[0].set_title(f'LEAST SPACIOUS LOCALITIES IN {city}') #set title
ax[0].set_xlabel('LOCALITY') #set xlabel
ax[0].set_ylabel('AVERAGE AREA IN SQUARE FEET') #set ylabel
ax[0].tick_params(axis='x',labelrotation=90) #rotate the labels on x axis by 90 degrees for readibility
sns.barplot(x=most_spacious.index, y=most_spacious,ax=ax[1])#plot least affordable localities in 2nd column
ax[1].set_title(f'MOST SPACIOUS LOCALITIES IN {city}')
ax[1].set_xlabel('LOCALITY') #set xlabel
ax[1].set_ylabel('AVERAGE AREA IN SQUARE FEET') #set ylabel
ax[1].tick_params(axis='x',labelrotation=90) #rotate the labels on x axis by 90 degrees for readibility
plt.tight_layout() #apply tight layout for no overlap
if not os.path.exists(SPACIOUS_PLOT_DESTINATION): #check if path exists
os.makedirs(SPACIOUS_PLOT_DESTINATION) #if not then create the path
plt.savefig(os.path.join(SPACIOUS_PLOT_DESTINATION, f'{city}.png')) #save the figure
train_r2_dict = {}
test_r2_dict = {}
# Model building
for city, df in preprocessed_df_dict.items():
# Grab X and y from the data
X = df.drop(['PRICE'], axis=1)
y = df['PRICE']
# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
# Since XGBoost is the best model (from models.ipynb), we will retrain a XGBRegressor model every time
model = XGBRegressor(objective='reg:squarederror')
model.fit(X_train, y_train)
# calculate r2_score for evaulation and write them to a file for evaluationn
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)
train_r2 = r2_score(y_true=y_train, y_pred=train_preds)
test_r2 = r2_score(y_true=y_test, y_pred=test_preds)
train_r2_dict[city] = train_r2
test_r2_dict[city] = test_r2
# Save the model
joblib.dump(model, os.path.join(MODEL_SAVE_PATH, f'{city}_model.pkl'))
with open('model_results.txt', 'a+') as f:
f.write(f'Ran on {dt.datetime.now()}\n')
f.write('Train r2\n')
f.write(str(train_r2_dict))
f.write('\n')
f.write('Test r2\n')
f.write(str(test_r2_dict))
f.write('\n')
print(f'Successfully executed on {dt.datetime.now()}')