-
Notifications
You must be signed in to change notification settings - Fork 89
/
Copy pathpandas_questions.py
153 lines (124 loc) · 5.79 KB
/
pandas_questions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
"""Plotting referendum results in pandas.
In short, we want to make a beautiful map to report the results of a referendum. In
some way, we would like to depict results with something similar to the maps
that you can find here:
https://github.com/x-datascience-datacamp/datacamp-assignment-pandas/blob/main/example_map.png
To do that, you will load the data as pandas.DataFrame, merge the info and
aggregate them by regions and finally plot them on a map using `geopandas`.
"""
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
def load_data():
    """Read the referendum, regions and departments CSV files.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(referendum, regions, departments)``, in that order. The
        referendum file is parsed with ``;`` as separator, the two other
        files with ``,``.
    """
    # (path, separator) for each table, in the order they are returned.
    sources = (
        ('data/referendum.csv', ';'),
        ('data/regions.csv', ','),
        ('data/departments.csv', ','),
    )
    referendum, regions, departments = (
        pd.read_csv(path, sep=separator) for path, separator in sources
    )
    return referendum, regions, departments
def merge_regions_and_departments(regions, departments):
    """Join every department with the region it belongs to.

    Parameters
    ----------
    regions : pandas.DataFrame
        Regions table; its ``code`` and ``name`` columns are used.
    departments : pandas.DataFrame
        Departments table; its ``region_code``, ``code`` and ``name``
        columns are used.

    Returns
    -------
    pandas.DataFrame
        One row per matching (region, department) pair, restricted to the
        columns ['code_reg', 'name_reg', 'code_dep', 'name_dep'].
    """
    region_names = {
        "code": "code_reg",
        "name": "name_reg",
        "slug": "slug_reg",
    }
    department_names = {
        "region_code": "code_reg",
        "code": "code_dep",
        "name": "name_dep",
        "slug": "slug_dep",
    }
    output_columns = ['code_reg', 'name_reg', 'code_dep', 'name_dep']

    renamed_regions = regions.rename(columns=region_names)
    renamed_departments = departments.rename(columns=department_names)
    # Inner join on the region code that both tables now share.
    joined = renamed_regions.merge(renamed_departments, on="code_reg")
    return joined[output_columns]
def merge_referendum_and_areas(referendum, regions_and_departments):
    """Attach region and department information to every referendum row.

    Rows relative to DOM-TOM-COM departments and to the French living
    abroad are dropped. Every remaining row is matched on its
    'Department code' against 'code_dep' once the leading zeros of
    'code_dep' have been stripped ('01' -> '1').

    Parameters
    ----------
    referendum : pandas.DataFrame
        Results table with at least the columns 'Department code' and
        'Department name'.
    regions_and_departments : pandas.DataFrame
        Table with the columns ['code_reg', 'name_reg', 'code_dep',
        'name_dep']; 'code_dep' must hold strings.

    Returns
    -------
    pandas.DataFrame
        The kept referendum rows (original index labels preserved) with
        four extra columns appended in this order: 'code_dep' (a copy of
        'Department code'), 'code_reg', 'name_reg' and 'name_dep'. The
        three looked-up columns hold strings. Neither input is modified.

    Raises
    ------
    ValueError
        If a kept referendum row has a 'Department code' that is not
        present in `regions_and_departments`.
    """
    overseas_regions = ["Guadeloupe", 'Martinique',
                        "Guyane", "La Réunion", "Mayotte",
                        "Collectivités d'Outre-Mer"]
    excluded_departments = [
        "FRANCAIS DE L'ETRANGER", 'POLYNESIE FRANCAISE', "MARTINIQUE",
        "NOUVELLE CALEDONIE", 'GUADELOUPE', "LA REUNION", "GUYANE",
        "MAYOTTE", 'SAINT-MARTIN/SAINT-BARTHELEMY',
        'SAINT PIERRE ET MIQUELON', 'WALLIS-ET-FUTUNA']

    # Filter with boolean masks rather than dropping by index label: label
    # based drops also remove unrelated rows when labels are duplicated.
    is_overseas = regions_and_departments["name_reg"].isin(overseas_regions)
    areas = regions_and_departments[~is_overseas].copy()
    # Strip *leading* zeros only, so that a code such as '0100' becomes
    # '100' and not '1'.
    areas["code_dep"] = areas["code_dep"].str.lstrip("0")
    # One row per department code (first occurrence wins), indexed by code.
    lookup = (
        areas.dropna(subset=["code_dep"])
        .drop_duplicates(subset="code_dep")
        .set_index("code_dep")
    )

    is_excluded = referendum["Department name"].isin(excluded_departments)
    referendum_and_areas = referendum[~is_excluded].copy()
    department_codes = referendum_and_areas["Department code"]
    referendum_and_areas["code_dep"] = department_codes

    # Fail with an explicit message instead of an obscure error further
    # down when a department cannot be matched.
    unknown = department_codes[~department_codes.isin(lookup.index)]
    if not unknown.empty:
        raise ValueError(
            "No region/department information for department code(s): "
            + ", ".join(sorted(set(unknown.astype(str))))
        )

    for column in ("code_reg", "name_reg", "name_dep"):
        # Values are converted to str before the lookup so the output
        # columns always hold strings, whatever the input dtype.
        referendum_and_areas[column] = department_codes.map(
            lookup[column].astype(str))
    return referendum_and_areas
def compute_referendum_result_by_regions(referendum_and_areas):
    """Return a table with the absolute vote counts for each region.

    Parameters
    ----------
    referendum_and_areas : pandas.DataFrame
        Table with at least the columns 'code_reg', 'name_reg',
        'Registered', 'Abstentions', 'Null', 'Choice A' and 'Choice B'.

    Returns
    -------
    pandas.DataFrame
        Indexed by `code_reg`, with the columns
        ['name_reg', 'Registered', 'Abstentions', 'Null', 'Choice A',
        'Choice B']. The count columns are summed over each region and
        'name_reg' is the first name found for that region.
    """
    count_columns = ['Registered', 'Abstentions', 'Null',
                     'Choice A', 'Choice B']
    by_region = referendum_and_areas.groupby("code_reg")
    # Only the numeric columns are summed; summing 'name_reg' would
    # concatenate the region name once per row for no benefit.
    referendum_result_by_regions = by_region[count_columns].sum()
    # Both results come from the same groupby, so they share one index and
    # the names line up whatever the dtype of 'code_reg'.
    referendum_result_by_regions.insert(
        0, "name_reg", by_region["name_reg"].first())
    return referendum_result_by_regions
def plot_referendum_map(referendum_result_by_regions):
    """Draw the share of 'Choice A' votes per region on a map.

    The region shapes are read from 'data/regions.geojson', whose 'code'
    column is renamed to 'code_reg' and used to join the shapes with
    `referendum_result_by_regions`. The plotted value is the number of
    'Choice A' ballots divided by the expressed ballots, i.e. the row-wise
    sum of 'Choice A' and 'Choice B'. The figure is displayed with
    `plt.show()`.

    Returns
    -------
    geopandas.GeoDataFrame
        The joined table, with an extra 'ratio' column holding the
        plotted value.
    """
    region_shapes = gpd.read_file('data/regions.geojson').rename(
        columns={'code': "code_reg"})
    results_with_shapes = pd.merge(
        referendum_result_by_regions, region_shapes, on='code_reg')

    expressed_ballots = results_with_shapes[
        ["Choice A", "Choice B"]].sum(axis=1)
    results_with_shapes["ratio"] = (
        results_with_shapes["Choice A"] / expressed_ballots)

    # pd.merge returns a plain DataFrame here; wrap it so that the
    # geographic plotting method is available.
    geo_results = gpd.GeoDataFrame(results_with_shapes)
    geo_results.plot("ratio")
    plt.show()
    return geo_results
if __name__ == "__main__":
    # Run the whole pipeline: load, merge, aggregate, then print and plot.
    raw_referendum, raw_regions, raw_departments = load_data()
    areas = merge_regions_and_departments(raw_regions, raw_departments)
    referendum_with_areas = merge_referendum_and_areas(raw_referendum, areas)
    results_by_region = compute_referendum_result_by_regions(
        referendum_with_areas)
    print(results_by_region)

    plot_referendum_map(results_by_region)
    plt.show()