-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLinearRegression.py
82 lines (79 loc) · 2.58 KB
/
LinearRegression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#%%
from turtle import color
from numpy import mean
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
#preprocessing data
# Load the training set; the columns used below are carat, depth, cut,
# color, clarity and price.
data = pd.read_csv(r'/home/harshal/Downloads/train.csv')
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
data.info()
#cleaning
#checking if null values are present
print(data.isnull().sum())
print(data.describe())
print(data.dtypes['carat'])
#filling na values, if present
# interpolate() returns a new frame, so the result has to be assigned back.
# Only numeric columns are interpolated; the string columns cannot be.
numeric_cols = data.select_dtypes(include='number').columns
data[numeric_cols] = data[numeric_cols].interpolate()
#1.feature engineering
#1.1 checking frequency of attributes
data.hist(bins=100, figsize=(20, 15))
#1.2 checking the relations between each attribute
sns.pairplot(data)
#1.3 checking data correlation through heatmap with their annotations
#checking the attributes with target variable
# Correlation is only defined for numeric columns; cut/color/clarity are still
# strings at this point, so they are excluded explicitly. Computed once and
# reused for the heatmap, the printout and the bar chart.
d1 = data.select_dtypes(include='number').corr()
# Each chart gets its own figure so it does not draw over the previous one.
plt.figure()
sns.heatmap(d1, annot=True)
print(d1)
plt.figure()
d1['price'].sort_values(ascending=False).plot(kind='bar')
#1.4 dropping depth as it has very poor relation with price
data = data.drop(columns=['depth'])
print(data)
#2.applying linear regression
#2.1 getting dummy values for columns with string attributes
col = pd.get_dummies(data['color'])
ty = pd.get_dummies(data['cut'])
cla = pd.get_dummies(data['clarity'])
#2.2 dropping one dummy column per attribute (the least frequent level)
# Keeping every level would make the dummy columns of each attribute sum to 1,
# which is perfectly collinear with the intercept.
col = col.drop(['J'], axis=1)
cla = cla.drop(['I1'], axis=1)
ty = ty.drop(['Fair'], axis=1)
#2.3 concatenating new columns to data and dropping away the old ones
data = pd.concat([data, col, cla, ty], axis=1)
data = data.drop(['cut', 'color', 'clarity'], axis=1)
#2.4 Final data
print(data)
#3.0 splitting data into train and test
X = data.drop(['price'], axis=1)
y = data['price']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)
#3.1 creating an object clf of class LinearRegression
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
#3.2 fitting the training results
clf.fit(X_train, y_train)
#3.3 predicting the test results of x
y_pred = clf.predict(X_test)
# Predictions on the training rows are kept under their own name so the true
# training labels in y_train are not overwritten.
y_train_pred = clf.predict(X_train)
#3.4 comparing the prediction with actual values
#function to calculate slope and y intercept
def gradient(x, y):
    """Return the least-squares slope and intercept of the line y = m*x + b.

    Args:
        x: Array-like of the independent values; it must support
            element-wise ``*`` with ``y`` (e.g. a numpy array or a pandas
            Series).
        y: Array-like of the dependent values, same length as ``x``.

    Returns:
        A tuple ``(m, b)`` with the slope ``m`` and the y-intercept ``b``.
    """
    mean_x = mean(x)
    mean_y = mean(y)
    # slope = cov(x, y) / var(x). Both terms are written with the sign
    # flipped, so the denominator is mean(x)**2 - mean(x*x); it must use the
    # mean of x twice, never the mean of y.
    m = (mean_x * mean_y - mean(x * y)) / (mean_x * mean_x - mean(x * x))
    b = mean_y - m * mean_x
    return m, b
# Fit a straight line through (actual, predicted) pairs of the held-out rows.
m1, b1 = gradient(y_test, y_pred)
print(m1, b1)
#4.0 plotting linear regression line
# New figure so the scatter does not draw on top of an earlier chart.
plt.figure()
plt.scatter(y_test, y_pred, s=0.1)
# The fitted line is evaluated at x = 0 .. 11999 (plot() uses the list index
# as the x coordinate).
plt.plot([m1 * x1 + b1 for x1 in range(12000)], color='red')
# NOTE(review): the visible window is 0..35 by -10..50 although the line is
# evaluated up to x = 11999 -- confirm these limits match the price scale.
plt.xlim(0, 35)
plt.ylim(-10, 50)
#-------------------------------------------------------------------------#
testd = pd.read_csv(r'/home/harshal/Downloads/test.csv')
# y_pred holds predictions for the held-out quarter of train.csv, not for the
# rows of test.csv, so it cannot be attached to testd. Encode test.csv the same
# way as the training features and predict on it instead.
# NOTE(review): assumes test.csv has the same carat/cut/color/clarity/...
# columns as train.csv and no missing values -- confirm.
test_dummies = [pd.get_dummies(testd[name]) for name in ('color', 'clarity', 'cut')]
# reindex() keeps exactly the columns the model was trained on, in the same
# order; a dummy level that does not occur in test.csv is filled with 0.
test_features = pd.concat([testd] + test_dummies, axis=1).reindex(
    columns=X.columns, fill_value=0
)
testd['Price'] = clf.predict(test_features)
print(testd)
#r2 score
from sklearn.metrics import r2_score
print(r2_score(y_test, y_pred))
# %%