Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Charts for USAID LaosEV Data Analysis #175

Draft
wants to merge 13 commits into
base: main
Choose a base branch
from
392 changes: 392 additions & 0 deletions viz_scripts/Demographics.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,392 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "420c5311",
"metadata": {},
"outputs": [],
"source": [
"#just for demographic charts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ff7d2517",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"from sklearn import linear_model\n",
"\n",
"# Apply the seaborn theme in a single call. A bare sns.set() issued after\n",
"# sns.set_style() resets the style to seaborn's default, so the whitegrid\n",
"# request has to be passed to sns.set() itself.\n",
"sns.set(style=\"whitegrid\")\n",
"%matplotlib inline\n",
"\n",
"# Shared matplotlib defaults for every figure in this notebook.\n",
"params = {\n",
"    'legend.fontsize': 'small',\n",
"    'figure.figsize': (10, 8),\n",
"    'axes.labelsize': 'small',\n",
"    'axes.titlesize': 'small',\n",
"    'xtick.labelsize': 'small',\n",
"    'ytick.labelsize': 'small',\n",
"}\n",
"plt.rcParams.update(params)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70064c19",
"metadata": {},
"outputs": [],
"source": [
"#START HERE FOR JUST DEMOGRAPHICS\n",
"data = pd.read_csv(\"DemographicData.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aac8e5bf",
"metadata": {},
"outputs": [],
"source": [
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c1ab91d",
"metadata": {},
"outputs": [],
"source": [
"# Recode raw survey answers into short display labels for the charts.\n",
"# Each raw answer sits next to its label in a dict, so a label cannot drift out of\n",
"# step with the answer it replaces (as happens easily with two parallel lists).\n",
"AGE_LABELS = {\n",
"    '_16_years_old': '<16',\n",
"    '16___20_years_old': '16-20',\n",
"    '21___25_years_old': '21-25',\n",
"    '26___30_years_old': '26-30',\n",
"    '31___35_years_old': '31-35',\n",
"    '36___40_years_old': '36-40',\n",
"    '41___45_years_old': '41-45',\n",
"    # Previously mislabelled '45-50', which the '46-50' age filter never matches.\n",
"    '46___50_years_old': '46-50',\n",
"    '51___55_years_old': '51-55',\n",
"    '56___60_years_old': '56-60',\n",
"}\n",
"# Applied to the whole frame: any cell equal to a raw age answer is relabelled.\n",
"data = data.replace(AGE_LABELS)\n",
"\n",
"VEHICLE_LABELS = {'prefer_not_to_say': 'Prefer Not to Say', 'more_than_3': '> 3'}\n",
"data['NUM_VEH'] = data['How_many_motor_vehicles_are_ow'].replace(VEHICLE_LABELS)\n",
"data['NUM_VEH_2'] = data['How_many_motor_vehicles_are_ow_001'].replace(VEHICLE_LABELS)\n",
"\n",
"# Dict order doubles as the category order of the STU column.\n",
"STUDENT_LABELS = {\n",
"    'prefer_not_to_say': 'Prefer Not to Say',\n",
"    'not_a_student': 'Not a Student',\n",
"    'yes___k_12th_grade_including_ged': 'K-12 Student',\n",
"    'yes___part_time_college_university': 'Part-time\\nUniversity',\n",
"    'yes___full_time_college_university': 'Full-time\\nUniversity',\n",
"}\n",
"# Answers outside STUDENT_LABELS become NaN once converted to Categorical.\n",
"data['STU'] = pd.Categorical(\n",
"    data['Are_you_a_student'].replace(STUDENT_LABELS),\n",
"    list(STUDENT_LABELS.values()),\n",
")\n",
"\n",
"# NOTE(review): the '< 50' and '50-100' labels do not match the answers they replace\n",
"# ('less_than__100', '_100_to__150') - confirm the intended income brackets.\n",
"INCOME_LABELS = {\n",
"    'prefer_not_to_say': 'Prefer Not to Say',\n",
"    'less_than__100': '< 50',\n",
"    '_100_to__150': '50-100',\n",
"    '_400_or_more': '> 400',\n",
"}\n",
"data['INC'] = data['Please_identify_which_category'].replace(INCOME_LABELS)\n",
"\n",
"HHSIZE_LABELS = {'prefer_not_to_say': 'Prefer Not to Say', 'more_than_7': '> 7'}\n",
"data['HHSIZE'] = data['Including_yourself_how_many_p'].replace(HHSIZE_LABELS)\n",
"\n",
"DRIVER_LABELS = {'prefer_not_to_say': 'Prefer Not to Say', 'more_than_4': '> 4'}\n",
"data['drivers'] = data['Including_yourself_how_many_p_001'].replace(DRIVER_LABELS)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "643f8889",
"metadata": {},
"outputs": [],
"source": [
"# Gender and age: keep one survey row per participant.\n",
"GENDER_COL = 'What_is_your_gender'\n",
"AGE_COL = 'How_old_are_you'\n",
"AGE_ORDER = ['16-20', '21-25', '26-30', '31-35', '36-40', '41-45', '46-50', '51-55', '56-60']\n",
"\n",
"print(len(data))\n",
"# First row per user. drop_duplicates keeps user_id as a regular column on every\n",
"# pandas version; groupby(...).nth(0) followed by a column selection loses it on\n",
"# pandas >= 2.0, where a second groupby on user_id then raises KeyError.\n",
"plot_data = (\n",
"    data.dropna(subset=['user_id'])\n",
"    .drop_duplicates(subset='user_id', keep='first')[['user_id', GENDER_COL, AGE_COL]]\n",
"    .dropna(subset=[GENDER_COL, AGE_COL])\n",
")\n",
"print(len(plot_data))\n",
"\n",
"# Keep only the gender and age categories that are charted, then sort by age label.\n",
"plot_data = plot_data[plot_data[GENDER_COL].isin(['man', 'woman'])]\n",
"plot_data = plot_data[plot_data[AGE_COL].isin(AGE_ORDER)]\n",
"plot_data = plot_data.sort_values(AGE_COL)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1af714d4",
"metadata": {},
"outputs": [],
"source": [
"plot_data.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "439e1d18",
"metadata": {},
"outputs": [],
"source": [
"plot_data['How_old_are_you'].unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4889af86",
"metadata": {},
"outputs": [],
"source": [
"plot_data.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fecd636e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"plot_title = 'Participant Demographics'\n",
"file_name = 'CanBikeCO_report_demog'\n",
"\n",
"# One row of two panels; a 2x2 grid would leave an empty bottom row in the saved figure.\n",
"fig, (ax_gender, ax_age) = plt.subplots(1, 2, figsize=(15, 4))\n",
"sns.histplot(data=plot_data, x='What_is_your_gender', ax=ax_gender, color='purple', stat='probability')\n",
"ax_gender.set(xlabel='Sex', ylabel='proportion')\n",
"sns.histplot(data=plot_data, x='How_old_are_you', ax=ax_age, color='red', stat='probability')\n",
"ax_age.set(xlabel='Age', ylabel='proportion')\n",
"\n",
"# Rotate the age labels on their own axes. plt.xticks() acts on whichever axes is\n",
"# current, which is not necessarily the age panel.\n",
"plt.setp(ax_age.get_xticklabels(), rotation=35, ha='right')\n",
"fig.suptitle(plot_title)\n",
"fig.tight_layout()\n",
"\n",
"fig.savefig(file_name + \".png\", bbox_inches='tight')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "78a24604",
"metadata": {},
"outputs": [],
"source": [
"VEHICLE_ORDER = ['Prefer Not to Say', '0', '1', '2', '3', '> 3']\n",
"VEHICLE_COLS = ['NUM_VEH', 'NUM_VEH_2']\n",
"\n",
"# First survey row per participant. drop_duplicates keeps user_id as a column on every\n",
"# pandas version, unlike groupby(...).nth(0)[cols] followed by a second groupby.\n",
"plot_data = data.dropna(subset=['user_id']).drop_duplicates(subset='user_id', keep='first')\n",
"plot_data = plot_data[['user_id'] + VEHICLE_COLS].copy()\n",
"\n",
"# pd.Categorical turns any answer that is not listed in VEHICLE_ORDER into NaN, and the\n",
"# dropna below then removes that participant.\n",
"# NOTE(review): this is a likely cause of the participants lost in this cell - confirm\n",
"# with data['NUM_VEH'].unique() (e.g. numeric 0/1/2 instead of the strings '0'/'1'/'2').\n",
"for col in VEHICLE_COLS:\n",
"    plot_data[col] = pd.Categorical(plot_data[col], VEHICLE_ORDER)\n",
"plot_data = plot_data.dropna(subset=VEHICLE_COLS)\n",
"\n",
"print(len(plot_data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "40de10cb",
"metadata": {},
"outputs": [],
"source": [
"plot_title = 'Participant Demographics'\n",
"# Own file name: the gender/age figure is saved as 'CanBikeCO_report_demog.png', so\n",
"# reusing that name here would overwrite it.\n",
"file_name = 'CanBikeCO_report_demog_vehicles'\n",
"\n",
"# (column, x-axis label, colour) for each panel; append a tuple to add a panel.\n",
"panels = [\n",
"    ('NUM_VEH', 'Number of Household Vehicles with 3 or 4 wheels', 'purple'),\n",
"    ('NUM_VEH_2', 'Number of Household Vehicles with 2 wheels', 'red'),\n",
"]\n",
"\n",
"# squeeze=False keeps axs two-dimensional even if only one panel is listed.\n",
"fig, axs = plt.subplots(1, len(panels), figsize=(15, 4), squeeze=False)\n",
"for ax, (column, xlabel, colour) in zip(axs[0], panels):\n",
"    sns.histplot(data=plot_data, x=column, ax=ax, color=colour, stat='probability')\n",
"    ax.set(xlabel=xlabel, ylabel='proportion')\n",
"    # Rotate labels on each panel explicitly; plt.xticks() only affects the current axes.\n",
"    plt.setp(ax.get_xticklabels(), rotation=35, ha='right')\n",
"fig.suptitle(plot_title)\n",
"fig.tight_layout()\n",
"\n",
"fig.savefig(file_name + \".png\", bbox_inches='tight')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ee301b2",
"metadata": {},
"outputs": [],
"source": [
"# challenges to travel"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "49ac6bfd",
"metadata": {},
"outputs": [],
"source": [
"# One row per participant (the first row of each user_id group), restricted to the\n",
"# gender and travel-challenge answers; rows missing either answer are discarded.\n",
"dem_survey_recent = (\n",
"    data.copy()\n",
"    .groupby(['user_id'])\n",
"    .nth(0)\n",
"    .loc[:, ['What_is_your_gender', 'Challenges_to_travel']]\n",
"    .dropna()\n",
")\n",
"print(len(dem_survey_recent)) #recall this is a new question\n",
"\n",
"# Number of remaining challenge answers for each gender value.\n",
"gender_counts = dem_survey_recent.groupby('What_is_your_gender').count()\n",
"gender_counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f6039a5b",
"metadata": {},
"outputs": [],
"source": [
"gender_challenge = dem_survey_recent[['What_is_your_gender', 'Challenges_to_travel']]\n",
"gender_challenge = gender_challenge.dropna(subset=['Challenges_to_travel', 'What_is_your_gender'])\n",
"\n",
"# Tallies are kept only for these two gender values: gender -> {challenge code: mentions}.\n",
"challenge_counts = {'man': {}, 'woman': {}}\n",
"\n",
"\n",
"def count_instances(row):\n",
"    \"\"\"Add one respondent's selected challenges to ``challenge_counts``.\n",
"\n",
"    ``row['Challenges_to_travel']`` holds the selected options separated by whitespace.\n",
"    Rows whose gender is not a key of ``challenge_counts`` are skipped instead of\n",
"    raising ``KeyError``.\n",
"    \"\"\"\n",
"    tally = challenge_counts.get(row['What_is_your_gender'])\n",
"    if tally is None:\n",
"        return\n",
"    # split() without an argument ignores repeated, leading and trailing spaces, so an\n",
"    # empty string is never counted as a challenge.\n",
"    for item in row['Challenges_to_travel'].split():\n",
"        tally[item] = tally.get(item, 0) + 1\n",
"\n",
"\n",
"# Plain loop instead of DataFrame.apply: the function is called only for its side effect,\n",
"# and apply would display a column of None values.\n",
"for _, row in gender_challenge.iterrows():\n",
"    count_instances(row)\n",
"challenge_counts"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "692b81e1",
"metadata": {},
"outputs": [],
"source": [
"def challenge_frame(counts, gender_label):\n",
"    \"\"\"Build the Challenge / Count / Gender / Proportion table for one gender.\n",
"\n",
"    counts: dict mapping a raw challenge code to its number of mentions.\n",
"    gender_label: value written to the Gender column.\n",
"    Proportion is each challenge's share of all mentions recorded in ``counts``.\n",
"    \"\"\"\n",
"    frame = pd.DataFrame({'Challenge': list(counts.keys()), 'Count': list(counts.values())})\n",
"    frame['Gender'] = gender_label\n",
"    frame['Proportion'] = frame['Count'] / frame['Count'].sum()\n",
"    return frame\n",
"\n",
"\n",
"man_challenge_df = challenge_frame(challenge_counts['man'], 'Man')\n",
"woman_challenge_df = challenge_frame(challenge_counts['woman'], 'Woman')\n",
"\n",
"# ignore_index gives a clean 0..n-1 index; reset_index() without drop=True would add a\n",
"# stray numeric 'index' column to the table.\n",
"challenge_df = pd.concat([woman_challenge_df, man_challenge_df], ignore_index=True)\n",
"\n",
"# Replace the truncated survey codes with readable legend labels.\n",
"challenge_df = challenge_df.replace({'Challenge': {\n",
"    'high_cost_of_tickets_or_fuel': 'High Cost',\n",
"    'lack_of_reliability___timing_or_scheduli': 'Lack of reliability',\n",
"    'convenience___lack_of_connectivity_acros': 'Convenience',\n",
"    'risk_of_road_accidents___the_roads_are_d': 'Risk of road accidents',\n",
"    'personal_safety___often_feel_unsafe_whil': 'Personal safety',\n",
"    'none_of_the_above': 'None of the above',\n",
"    'access___difficulty_connecting_between_t': 'Access',\n",
"}})\n",
"challenge_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "17dd42e2",
"metadata": {},
"outputs": [],
"source": [
"width = 0.8\n",
"genders = ['Woman', 'Man']\n",
"fig, ax = plt.subplots(figsize=(10, 6))\n",
"fig_data = challenge_df.copy()\n",
"challenges = pd.unique(fig_data['Challenge'])\n",
"\n",
"\n",
"def by_gender(value_col):\n",
"    \"\"\"Return a challenge x gender table of ``value_col``; missing combinations are 0.\n",
"\n",
"    Guarantees one value per gender for every challenge, so a challenge reported by\n",
"    only one gender no longer breaks the stacking.\n",
"    \"\"\"\n",
"    table = fig_data.pivot_table(index='Challenge', columns='Gender', values=value_col,\n",
"                                 aggfunc='sum', fill_value=0)\n",
"    # Restore first-appearance challenge order and a fixed gender order.\n",
"    return table.reindex(index=challenges, columns=genders, fill_value=0)\n",
"\n",
"\n",
"percent = by_gender('Proportion') * 100\n",
"count = by_gender('Count')\n",
"\n",
"# Each segment starts where the previous challenge's segment ended (one offset per gender).\n",
"running_total = np.zeros(len(genders))\n",
"for challenge in challenges:\n",
"    vals = percent.loc[challenge].to_numpy(dtype=float)\n",
"    bar_labels = count.loc[challenge].to_numpy()\n",
"\n",
"    # Only segments above 7 % get a text label (presumably so the text fits the bar).\n",
"    vals_str = [f'{y:.1f} %\\n({int(x):,})' if y > 7 else '' for x, y in zip(bar_labels, vals)]\n",
"    bar = ax.barh(genders, vals, width, left=running_total, label=challenge)\n",
"    ax.bar_label(bar, label_type='center', labels=vals_str, rotation=90, fontsize=12)\n",
"    running_total = running_total + vals\n",
"\n",
"ax.set_title('Challenges for Travel', fontsize=25)\n",
"ax.set_xlabel('Share of reported challenges (%)')\n",
"ax.legend(bbox_to_anchor=(1,1), fancybox=True, shadow=True, fontsize=12)\n",
"plt.subplots_adjust(bottom=0.20)\n",
"fig.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e815ff3",
"metadata": {},
"outputs": [],
"source": [
"# Check: Proportion should sum to 1 for each gender. Only the numeric columns are\n",
"# summed; summing the whole frame would also sum or concatenate the other columns.\n",
"challenge_df.groupby('Gender')[['Count', 'Proportion']].sum()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.20"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Loading