-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleaning.py
209 lines (155 loc) · 8.31 KB
/
cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Data Cleaning and File reading.
# To integrate the five files, we began by evaluating each in Excel to better understand the data structure and content. Upon analysis, we discovered that all files shared identical columns, with a few exceptions: some files added irrelevant columns such as Lower Whisker, Upper Whisker, and Lower Interval. To streamline the dataset, we removed the superfluous columns directly from Excel.
# Additionally, we discovered anomalies in column names across the files. For example, Happiness Score was sometimes written as Happiness.score. To solve this, we cleaned up the data and standardized the column names in Excel to increase readability and consistency.
# The data is provided as five separate files, one per year (2015-2019).
# Because all files share the same columns, we load each one and concatenate them into a single dataset.
# Paths of the yearly datasets, keyed by the year as a string ('2015' ... '2019').
file_path = {str(year): f'{year}.csv' for year in range(2015, 2020)}
# Function for loading each dataset.
def load_data(file_path):
    """Read every yearly CSV and stack the rows into one DataFrame.

    Args:
        file_path: Mapping of a year label (anything ``int()`` accepts,
            e.g. ``'2015'``) to the path of that year's CSV file.

    Returns:
        A single DataFrame containing the rows of all files in mapping
        order, with an integer ``Year`` column added and a fresh
        0..n-1 row index.

    Raises:
        ValueError: If ``file_path`` is empty, or a key cannot be
            converted with ``int()``.
    """
    # Tag every row with the year of the file it came from.
    data_frames = [
        pd.read_csv(path).assign(Year=int(year))
        for year, path in file_path.items()
    ]
    if not data_frames:
        raise ValueError('file_path must map at least one year to a CSV path')
    # Each file is read with its own 0-based index; ignore_index renumbers the
    # combined rows so the result has no duplicate row labels.
    return pd.concat(data_frames, axis=0, ignore_index=True)
# Combine all yearly files into one frame and renumber the rows 0..n-1.
# reset_index(drop=True) discards the old per-file row labels directly, instead
# of materialising them as an 'index' column and dropping that column again.
happiness_df = load_data(file_path).reset_index(drop=True)
# Displaying the combined data (only renders in a notebook; no effect in a script).
happiness_df
# Count the missing entries in each column.
null_values = happiness_df.isnull().sum()
null_values
# Notebook-style inspection of column types and names (no effect in a script).
happiness_df.dtypes
happiness_df.columns
# Fill the null values with 0.
# NOTE(review): the zeros take part in every mean/correlation computed below
# and are written to the exported CSV -- confirm 0 is the intended placeholder.
happiness_df = happiness_df.fillna(0)
# Summary statistics of every numeric column except Year.
# (Computed once; the script previously repeated this identical step twice.)
stats = happiness_df.describe().drop(columns='Year')
stats
# Happiness score of each country, indexed by (Year, Country).
Happiness_score = (
    happiness_df
    .groupby(['Year', 'Country'])['Happiness Score']
    .sum()
)

# Histogram (with KDE curve) of all happiness scores across the five years.
plt.figure(figsize=(10, 6))
sns.histplot(data=happiness_df, x='Happiness Score', kde=True, color='skyblue')
plt.title('Distribution of Happiness Scores')
plt.xlabel('Happiness Score')
plt.show()

# Mean happiness score per year: printed as a table, then drawn as a bar chart.
Avg_Happiness_score = happiness_df['Happiness Score'].groupby(happiness_df['Year']).mean()
print(Avg_Happiness_score)

plt.figure(figsize=(10, 6))
sns.barplot(x="Year", y="Happiness Score", data=happiness_df)
plt.title('Average Happiness Score Over Years')
plt.show()
# One scatter plot per factor: happiness score on the x-axis, the factor on
# the y-axis, points coloured by year.
for factor in ('Economy (GDP per Capita)', 'Health (Life Expectancy)', 'Social support'):
    sns.scatterplot(data=happiness_df, x='Happiness Score', y=factor, hue="Year")
    plt.show()
# Pairwise correlation of all numeric columns (result only shown in a notebook).
happiness_df.corr(numeric_only=True)

# Correlation restricted to the happiness score and three key factors,
# rendered as an annotated heatmap.
selected_columns = [
    'Happiness Score',
    'Economy (GDP per Capita)',
    'Social support',
    'Health (Life Expectancy)',
]
correlation_matrix = happiness_df.loc[:, selected_columns].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=True)
plt.title("Correlation Matrix")
plt.show()
#### Relationship between happiness scores and features such as GDP, social support, and life expectancy.
### Based on the correlation scores and the scatterplots, each of these features has a strong positive relationship with the happiness score: as one increases, so does the other.
# Ten highest happiness scores across all five years.
# NOTE(review): the grouping is by (Year, Country), so the same country can
# appear more than once (one entry per year) -- confirm this is the intended
# meaning of "overall".
score_per_year_country = happiness_df.groupby(['Year', 'Country'])['Happiness Score'].sum()
Happiness_score_10 = score_per_year_country.sort_values(ascending=False).head(10)

Happiness_score_10.plot.bar(edgecolor='black')
plt.show()
print(Happiness_score_10)
# For every year, keep the ten rows with the largest 'Happiness Score'.
yearly_groups = happiness_df.groupby("Year")
top_10_countries = yearly_groups.apply(
    lambda yearly_rows: yearly_rows.nlargest(10, 'Happiness Score')
)

print("Top 10 Happiest Countries Each Year:", '\n')
print(top_10_countries[['Country', 'Happiness Score']])

# Bar chart of the selected scores, labelled by country name.
top_10_scores = top_10_countries.set_index('Country')['Happiness Score']
top_10_scores.plot.bar(edgecolor='black', figsize=(12, 8))
plt.show()
# Mean value of each happiness factor, computed separately for every year.
years = [2019, 2018, 2017, 2016, 2015]
factor_columns = [
    'Economy (GDP per Capita)', 'Social support', 'Health (Life Expectancy)',
    'Freedom', 'Trust (Government Corruption)', 'Generosity',
]
factor_contributions = {
    year: happiness_df.loc[happiness_df['Year'] == year, factor_columns].mean()
    for year in years
}

# One bar chart per year, laid out side by side in a single wide figure.
plt.figure(figsize=(20, 5))
for position, year in enumerate(years, start=1):
    plt.subplot(1, 5, position)
    factor_contributions[year].plot(kind='bar', color='teal')
    plt.title(f'Factors Contribution to Happiness ({year})')
    plt.ylabel('Mean Value')
    plt.xticks(rotation=45)  # slanted labels stay readable in narrow panels
plt.tight_layout()  # keep neighbouring panels from overlapping
plt.show()
## 1. Insights for NGOs
### NGOs focus on factors related to social well-being, such as Social Support, Generosity, and Healthy Life Expectancy. The goal is to identify countries with low scores in these areas and provide recommendations.
# Per-country mean of the three well-being factors over all years.
ngo_columns = ['Social support', 'Generosity', 'Health (Life Expectancy)']
ngo_factors = happiness_df.groupby('Country')[ngo_columns].mean()

# The ten countries with the smallest average in each of two factors.
low_social_support = ngo_factors.sort_values(by='Social support').head(10)
low_life_expectancy = ngo_factors.sort_values(by='Health (Life Expectancy)').head(10)

print("Top 10 Countries with Lowest Social Support:")
print(low_social_support)
print("\nTop 10 Countries with Lowest Healthy Life Expectancy:")
print(low_life_expectancy)

# Horizontal bar chart: countries with the lowest social support.
low_social_support.plot.barh(y='Social support', legend=False, color='orange', figsize=(8, 5))
plt.title('Countries with Lowest Social Support')
plt.xlabel('Social Support Score')
plt.ylabel('Country')
plt.show()

# Horizontal bar chart: countries with the lowest healthy life expectancy.
low_life_expectancy.plot.barh(y='Health (Life Expectancy)', legend=False, color='teal', figsize=(8, 5))
plt.title('Countries with Lowest Healthy Life Expectancy')
plt.xlabel('Healthy Life Expectancy')
plt.ylabel('Country')
plt.show()
# Rows where social support is high (> 1) but the happiness score is low (< 5).
high_social_low_happiness = happiness_df[
    (happiness_df['Social support'] > 1) & (happiness_df['Happiness Score'] < 5)
]
print("Countries with High Social Support but Low Happiness:")
# Print the frame built just above. high_gdp_low_happiness must not be used
# here: it is only created further down the script, so referencing it at this
# point raises NameError (and it would show GDP-filtered rows anyway).
print(high_social_low_happiness[['Country', 'Social support', 'Happiness Score']])
## 2. Insights for Governments
### Governments are typically more interested in broader economic and societal factors like GDP per Capita, Happiness Score, and the correlation between these factors. The focus is to identify key drivers of happiness for policy planning.
# Correlation of the happiness score with GDP per capita and life expectancy.
gov_columns = ['Happiness Score', 'Economy (GDP per Capita)', 'Health (Life Expectancy)']
gov_factors = happiness_df.loc[:, gov_columns].corr()
print("Correlation Between Key Factors and Happiness Score:")
print(gov_factors['Happiness Score'])

# Annotated heatmap of the same correlation matrix (values shown to 2 decimals).
plt.figure(figsize=(8, 6))
sns.heatmap(gov_factors, cmap='coolwarm', annot=True, fmt=".2f")
plt.title('Correlation Between Key Factors')
plt.show()
# Rows where GDP per capita is high (> 1) but the happiness score is low (< 5):
# candidates for policy gaps.
is_high_gdp = happiness_df['Economy (GDP per Capita)'] > 1
is_low_happiness = happiness_df['Happiness Score'] < 5
high_gdp_low_happiness = happiness_df[is_high_gdp & is_low_happiness]
print("Countries with High GDP but Low Happiness:")
print(high_gdp_low_happiness[['Country', 'Economy (GDP per Capita)', 'Happiness Score']])

# Scatter of GDP against happiness for all rows, coloured by year, with a
# dashed red line marking the low-happiness threshold used above.
plt.figure(figsize=(8, 5))
sns.scatterplot(
    x='Economy (GDP per Capita)',
    y='Happiness Score',
    hue='Year',
    palette='viridis',
    data=happiness_df,
)
plt.title('GDP per Capita vs Happiness Score')
plt.xlabel('GDP per Capita')
plt.ylabel('Happiness Score')
plt.axhline(5, color='red', linestyle='--', label='Low Happiness Threshold')
plt.legend()
plt.show()
# Write the cleaned, combined dataset to disk (without the row index) for the
# Streamlit app.
happiness_df.to_csv(path_or_buf='World_happiness_report.csv', index=False)