-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataset_generator.py
33 lines (25 loc) · 1.08 KB
/
dataset_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
# Generate a synthetic one-feature regression dataset, rescale it so it reads
# as "salary vs. years of experience" data, plot it, and save it as a CSV file.

n_samples = 500  # number of samples
noise = 1  # standard deviation of the Gaussian noise applied to the output

x, y = datasets.make_regression(
    n_samples=n_samples,  # number of samples
    n_features=1,  # number of features
    n_informative=1,  # number of useful features
    noise=noise,  # standard deviation of the Gaussian noise
    random_state=10,  # fixed seed so every run yields the same data points
)

# Scale the Y axis (years of experience) to 0.1..20 years, rounded to 2 d.p.
y = np.around(np.interp(y, (y.min(), y.max()), (0.1, 20)), decimals=2)

# Scale the X axis (salary) to the range 20000..150000, rounded to 2 d.p.
x = np.around(np.interp(x, (x.min(), x.max()), (20000, 150000)), decimals=2)

plt.plot(x, y, ".", label="training data")
plt.xlabel("Salary (£)")
plt.ylabel("Years of experience Y")
plt.title("Experience Vs. Salary")
plt.show()

# Create a dataframe with the x and y values. x is returned by
# make_regression as a 2-D (n_samples, n_features) array, so it is flattened
# into a single column here.
df = pd.DataFrame({"YearsExperience": y, "Salary": x.flatten()})

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing; otherwise the script would fail
# only at the very end, after the data has already been generated.
output_path = Path("./LinearRegression/Data/Custom_Salary_Data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)