e-mission · Abby-Wheelis · Apr 19, 2024 · Nov 15, 2024 · Nov 15, 2024 · Nov 15, 2024
diff --git a/viz_scripts/Demographics.ipynb b/viz_scripts/Demographics.ipynb
@@ -0,0 +1,392 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "420c5311",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# This notebook only produces the demographic charts."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff7d2517",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
+    "from sklearn import linear_model\n",
+    "\n",
+    "# Apply the seaborn theme with the whitegrid style in a single call. Calling\n",
+    "# sns.set_style(\"whitegrid\") and then sns.set() re-applies the default theme\n",
+    "# style, which discards the whitegrid request.\n",
+    "sns.set_theme(style=\"whitegrid\")\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# Figure defaults shared by the charts in this notebook. They are applied after\n",
+    "# the seaborn theme because the theme writes its own font-size rcParams.\n",
+    "params = {'legend.fontsize': 'small',\n",
+    "          'figure.figsize': (10, 8),\n",
+    "          'axes.labelsize': 'small',\n",
+    "          'axes.titlesize': 'small',\n",
+    "          'xtick.labelsize': 'small',\n",
+    "          'ytick.labelsize': 'small'}\n",
+    "plt.rcParams.update(params)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "70064c19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#START HERE FOR JUST DEMOGRAPHICS\n",
+    "# Load the survey export; the path is relative to the notebook's working directory.\n",
+    "data = pd.read_csv(\"DemographicData.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aac8e5bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the column names that were read from the CSV.\n",
+    "data.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c1ab91d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map the raw survey codes onto the short labels used on the charts.\n",
+    "# Each mapping is a dict so that a raw code and its label sit side by side;\n",
+    "# codes that are not listed pass through unchanged.\n",
+    "\n",
+    "# Age bands. '46___50_years_old' must become '46-50' (not '45-50') so that it\n",
+    "# matches the age filter applied before plotting.\n",
+    "age_labels = {\n",
+    "    '_16_years_old': '<16',\n",
+    "    '16___20_years_old': '16-20',\n",
+    "    '21___25_years_old': '21-25',\n",
+    "    '26___30_years_old': '26-30',\n",
+    "    '31___35_years_old': '31-35',\n",
+    "    '36___40_years_old': '36-40',\n",
+    "    '41___45_years_old': '41-45',\n",
+    "    '46___50_years_old': '46-50',\n",
+    "    '51___55_years_old': '51-55',\n",
+    "    '56___60_years_old': '56-60',\n",
+    "}\n",
+    "# Applied to the whole frame, so these codes are relabelled in every column.\n",
+    "data = data.replace(age_labels)\n",
+    "\n",
+    "# Household vehicle counts (two separate survey questions share the same codes).\n",
+    "vehicle_labels = {'prefer_not_to_say': 'Prefer Not to Say', 'more_than_3': '> 3'}\n",
+    "data['NUM_VEH'] = data['How_many_motor_vehicles_are_ow'].replace(vehicle_labels)\n",
+    "data['NUM_VEH_2'] = data['How_many_motor_vehicles_are_ow_001'].replace(vehicle_labels)\n",
+    "\n",
+    "# Student status, stored as a categorical with an explicit display order.\n",
+    "student_labels = {\n",
+    "    'prefer_not_to_say': 'Prefer Not to Say',\n",
+    "    'not_a_student': 'Not a Student',\n",
+    "    'yes___full_time_college_university': 'Full-time\\nUniversity',\n",
+    "    'yes___k_12th_grade_including_ged': 'K-12 Student',\n",
+    "    'yes___part_time_college_university': 'Part-time\\nUniversity',\n",
+    "}\n",
+    "student_order = ['Prefer Not to Say', 'Not a Student', 'K-12 Student', 'Part-time\\nUniversity', 'Full-time\\nUniversity']\n",
+    "data['STU'] = pd.Categorical(data['Are_you_a_student'].replace(student_labels), student_order)\n",
+    "\n",
+    "# Income.\n",
+    "# NOTE(review): the labels '< 50' and '50-100' do not line up with the bounds in\n",
+    "# the raw codes ('less_than__100', '_100_to__150') - confirm the intended bands.\n",
+    "income_labels = {\n",
+    "    'prefer_not_to_say': 'Prefer Not to Say',\n",
+    "    'less_than__100': '< 50',\n",
+    "    '_100_to__150': '50-100',\n",
+    "    '_400_or_more': '> 400',\n",
+    "}\n",
+    "data['INC'] = data['Please_identify_which_category'].replace(income_labels)\n",
+    "\n",
+    "# Household size and number of drivers.\n",
+    "data['HHSIZE'] = data['Including_yourself_how_many_p'].replace(\n",
+    "    {'prefer_not_to_say': 'Prefer Not to Say', 'more_than_7': '> 7'})\n",
+    "\n",
+    "data['drivers'] = data['Including_yourself_how_many_p_001'].replace(\n",
+    "    {'prefer_not_to_say': 'Prefer Not to Say', 'more_than_4': '> 4'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "643f8889",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Age and gender: one row per participant.\n",
+    "plot_data = data.copy()\n",
+    "print(len(plot_data))\n",
+    "\n",
+    "# Keep each user's first response. drop_duplicates() replaces the previous\n",
+    "# groupby('user_id').nth(0) + second groupby('user_id'): from pandas 2.0 nth()\n",
+    "# keeps the original index, so after the column selection 'user_id' no longer\n",
+    "# existed and the second groupby raised KeyError. Rows without a user_id are\n",
+    "# dropped first, as groupby did implicitly.\n",
+    "plot_data = (plot_data.dropna(subset=['user_id'])\n",
+    "                      .drop_duplicates(subset='user_id', keep='first')\n",
+    "                      .set_index('user_id')[['What_is_your_gender', 'How_old_are_you']]\n",
+    "                      .dropna())\n",
+    "print(len(plot_data))\n",
+    "\n",
+    "# Only 'man' and 'woman' responses are charted.\n",
+    "plot_data = plot_data[plot_data['What_is_your_gender'].isin(['man','woman'])]\n",
+    "\n",
+    "# Keep only the age bands shown on the chart ('<16' is not included), then sort\n",
+    "# so the bands appear in ascending order on the x axis.\n",
+    "age_bands = ['16-20',\n",
+    "             '21-25',\n",
+    "             '26-30',\n",
+    "             '31-35',\n",
+    "             '36-40',\n",
+    "             '41-45',\n",
+    "             '46-50',\n",
+    "             '51-55',\n",
+    "             '56-60']\n",
+    "plot_data = plot_data[plot_data['How_old_are_you'].isin(age_bands)]\n",
+    "plot_data = plot_data.sort_values(\"How_old_are_you\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1af714d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check which columns remain after the per-user selection.\n",
+    "plot_data.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "439e1d18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List the age labels that are left after filtering.\n",
+    "plot_data['How_old_are_you'].unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4889af86",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Preview the first rows of the frame that feeds the gender/age charts.\n",
+    "plot_data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fecd636e",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "plot_title='Participant Demographics'\n",
+    "ylab='Count'\n",
+    "file_name='CanBikeCO_report_demog'\n",
+    "\n",
+    "# NOTE(review): only the top row of this 2x2 grid is drawn on; the bottom row\n",
+    "# is saved as two empty panels.\n",
+    "fig, axs = plt.subplots(2,2,figsize=(15,8))\n",
+    "sns.histplot(data=plot_data, x='What_is_your_gender', ax=axs[0,0], color='purple', stat='probability').set(xlabel='Sex',ylabel='proportion')\n",
+    "sns.histplot(data=plot_data, x='How_old_are_you', ax=axs[0,1], color='red', stat='probability').set(xlabel='Age',ylabel='proportion')\n",
+    "\n",
+    "# plt.xticks() acts on the current axes, which after plt.subplots() is the last\n",
+    "# (empty) panel. Make the age panel current so its labels are the ones rotated.\n",
+    "plt.sca(axs[0,1])\n",
+    "plt.xticks(rotation=35, ha='right')\n",
+    "plt.tight_layout()\n",
+    "\n",
+    "# Write the figure to <file_name>.png in the working directory.\n",
+    "fig.savefig(file_name+\".png\", bbox_inches='tight')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "78a24604",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_data = data.copy()\n",
+    "\n",
+    "# Fix the display order of the vehicle-count answers. pd.Categorical turns any\n",
+    "# value that is not in this list into NaN.\n",
+    "vehicle_categories = ['Prefer Not to Say', '0', '1', '2', '3', '> 3']\n",
+    "plot_data['NUM_VEH'] = pd.Categorical(plot_data['NUM_VEH'], vehicle_categories)\n",
+    "plot_data['NUM_VEH_2'] = pd.Categorical(plot_data['NUM_VEH_2'], vehicle_categories)\n",
+    "# plot_data['HHSIZE'] = pd.Categorical(plot_data['HHSIZE'], ['Prefer Not to Say', '1', '2', '3', '4', '5', '6', '7', '> 7'])\n",
+    "# plot_data['drivers'] = pd.Categorical(plot_data['drivers'], ['Prefer Not to Say', '0', '1', '2', '3', '4', '> 4'])\n",
+    "\n",
+    "# Keep each user's first response. drop_duplicates() replaces the previous\n",
+    "# groupby('user_id').nth(0) + second groupby('user_id'): from pandas 2.0 nth()\n",
+    "# keeps the original index, so after the column selection 'user_id' no longer\n",
+    "# existed and the second groupby raised KeyError.\n",
+    "# NOTE(review): the users lost at dropna() are presumably those whose answer is\n",
+    "# missing or falls outside vehicle_categories - confirm against the CSV.\n",
+    "plot_data = (plot_data.dropna(subset=['user_id'])\n",
+    "                      .drop_duplicates(subset='user_id', keep='first')\n",
+    "                      .set_index('user_id')[['NUM_VEH','NUM_VEH_2']]\n",
+    "                      .dropna())\n",
+    "\n",
+    "print(len(plot_data)) #22 participants"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "40de10cb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plot_title='Participant Demographics'\n",
+    "ylab='Count'\n",
+    "# Use a distinct file name: the gender/age figure earlier in this notebook is\n",
+    "# saved as CanBikeCO_report_demog.png, and reusing that name overwrote it.\n",
+    "file_name='CanBikeCO_report_demog_vehicles'\n",
+    "\n",
+    "fig, axs = plt.subplots(2,2,figsize=(15,8))\n",
+    "sns.histplot(data=plot_data, x='NUM_VEH', ax=axs[0,0], color='purple', stat='probability').set(xlabel='Number of Household Vehicles with 3 or 4 wheels',ylabel='proportion')\n",
+    "sns.histplot(data=plot_data, x='NUM_VEH_2', ax=axs[0,1], color='red', stat='probability').set(xlabel='Number of Household Vehicle with 2 wheels',ylabel='proportion')\n",
+    "# sns.histplot(data=pd.DataFrame(plot_data['HHSIZE'].dropna()), x='HHSIZE', ax=axs[1,0], color='blue', stat='probability').set(xlabel='Number of People in Household',ylabel='proportion')\n",
+    "# sns.histplot(data=pd.DataFrame(plot_data['drivers'].dropna()), x='drivers', ax=axs[1,1], color='green', stat='probability').set(xlabel='Number of Drivers in Household',ylabel='proportion')\n",
+    "\n",
+    "# plt.xticks() acts on the current axes, which after plt.subplots() is the last\n",
+    "# (empty) panel. Rotate the labels of the two panels that are actually drawn.\n",
+    "for drawn_ax in (axs[0,0], axs[0,1]):\n",
+    "    plt.sca(drawn_ax)\n",
+    "    plt.xticks(rotation=35, ha='right')\n",
+    "plt.tight_layout()\n",
+    "\n",
+    "# Write the figure to <file_name>.png in the working directory.\n",
+    "fig.savefig(file_name+\".png\", bbox_inches='tight')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ee301b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# challenges to travel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49ac6bfd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# First response per user, restricted to the two columns used for the\n",
+    "# challenges analysis; rows missing either answer are dropped.\n",
+    "survey_columns = ['What_is_your_gender', 'Challenges_to_travel']\n",
+    "first_response = data.copy().groupby(['user_id']).nth(0)\n",
+    "dem_survey_recent = first_response[survey_columns].dropna()\n",
+    "print(len(dem_survey_recent)) #recall this is a new question\n",
+    "\n",
+    "# Number of usable responses per gender.\n",
+    "gender_counts = dem_survey_recent.groupby('What_is_your_gender').count()\n",
+    "gender_counts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f6039a5b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gender_challenge = dem_survey_recent[['What_is_your_gender', 'Challenges_to_travel']]\n",
+    "gender_challenge = gender_challenge.dropna(subset=['Challenges_to_travel','What_is_your_gender'])\n",
+    "\n",
+    "# Per-gender tally of how often each challenge code was selected. Re-created on\n",
+    "# every run of this cell so that re-running does not double count.\n",
+    "challenge_counts = {'man':{}, 'woman':{}}\n",
+    "\n",
+    "def count_instances(row):\n",
+    "    \"\"\"Add the challenges selected in one survey row to ``challenge_counts``.\n",
+    "\n",
+    "    ``row`` must provide 'What_is_your_gender' and 'Challenges_to_travel'; the\n",
+    "    latter is treated as a whitespace-separated list of challenge codes.\n",
+    "    Rows whose gender is not a key of ``challenge_counts`` are skipped rather\n",
+    "    than raising KeyError, because the input is not filtered to those genders.\n",
+    "    Returns None; the tally is updated in place.\n",
+    "    \"\"\"\n",
+    "    tally = challenge_counts.get(row['What_is_your_gender'])\n",
+    "    if tally is None:\n",
+    "        return\n",
+    "    # split() without an argument ignores repeated spaces, so no empty code\n",
+    "    # is ever counted.\n",
+    "    for item in row['Challenges_to_travel'].split():\n",
+    "        tally[item] = tally.get(item, 0) + 1\n",
+    "\n",
+    "# The trailing semicolon hides the Series of None values that apply() returns.\n",
+    "gender_challenge.apply(count_instances, axis=1);"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "692b81e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def _challenge_frame(gender_key, gender_label):\n",
+    "    \"\"\"Build the per-challenge table for one gender from ``challenge_counts``.\n",
+    "\n",
+    "    gender_key: key into ``challenge_counts`` ('man' or 'woman').\n",
+    "    gender_label: value written to the 'Gender' column.\n",
+    "    Returns a frame with columns Challenge, Count, Gender and Proportion, where\n",
+    "    Proportion is that challenge's share of all selections for the gender.\n",
+    "    \"\"\"\n",
+    "    frame = pd.DataFrame.from_dict(challenge_counts[gender_key], orient='index', columns=['Count'])\n",
+    "    frame.index.names = ['Challenge']\n",
+    "    frame = frame.reset_index()\n",
+    "    frame['Gender'] = gender_label\n",
+    "    frame['Proportion'] = frame['Count'] / frame['Count'].sum()\n",
+    "    return frame\n",
+    "\n",
+    "man_challenge_df = _challenge_frame('man', 'Man')\n",
+    "woman_challenge_df = _challenge_frame('woman', 'Woman')\n",
+    "\n",
+    "# Stack both genders (women first). drop=True discards the old per-gender row\n",
+    "# numbers instead of keeping them as a meaningless 'index' column.\n",
+    "challenge_df = pd.concat([woman_challenge_df, man_challenge_df])\n",
+    "challenge_df = challenge_df.reset_index(drop=True)\n",
+    "\n",
+    "# Replace the truncated survey codes with readable legend labels.\n",
+    "challenge_df = challenge_df.replace({'Challenge': {\n",
+    "    'high_cost_of_tickets_or_fuel': 'High Cost', \n",
+    "    'lack_of_reliability___timing_or_scheduli': 'Lack of reliability',\n",
+    "    'convenience___lack_of_connectivity_acros': 'Convenience',\n",
+    "    'risk_of_road_accidents___the_roads_are_d': 'Risk of road accidents',\n",
+    "    'personal_safety___often_feel_unsafe_whil': 'Personal safety',\n",
+    "    'none_of_the_above': 'None of the above',\n",
+    "    'access___difficulty_connecting_between_t': 'Access'\n",
+    "    }})\n",
+    "challenge_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17dd42e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "width = 0.8\n",
+    "fig, ax = plt.subplots(figsize=(10, 6))\n",
+    "fig_data = challenge_df.copy()\n",
+    "\n",
+    "# One horizontal bar per gender, in the order the genders appear in the table.\n",
+    "genders = list(pd.unique(fig_data['Gender']))\n",
+    "# Left edge of the next segment for each gender's bar.\n",
+    "running_total = np.zeros(len(genders))\n",
+    "\n",
+    "for challenge in pd.unique(fig_data.Challenge):\n",
+    "    # Align this challenge's rows to the full gender list. A gender that never\n",
+    "    # selected the challenge gets a zero-width segment; previously a challenge\n",
+    "    # reported by only one gender broke the positional lookups below.\n",
+    "    band_data = (fig_data[fig_data['Challenge']==challenge]\n",
+    "                 .groupby('Gender')[['Count', 'Proportion']].sum()\n",
+    "                 .reindex(genders, fill_value=0))\n",
+    "\n",
+    "    vals = band_data['Proportion'].to_numpy()*100\n",
+    "    bar_labels = band_data['Count'].to_numpy()\n",
+    "\n",
+    "    # Only label segments wider than 7 % so the text fits inside the bar.\n",
+    "    vals_str = [f'{y:.1f} %\\n({int(x):,})' if y>7 else '' for x, y in zip(bar_labels, vals)]\n",
+    "    bar = ax.barh(genders, vals, width, left=running_total, label=challenge)\n",
+    "    ax.bar_label(bar, label_type='center', labels=vals_str, rotation=90, fontsize=12)\n",
+    "    running_total = running_total + vals\n",
+    "\n",
+    "ax.set_title('Challenges for Travel', fontsize=25)\n",
+    "ax.legend(bbox_to_anchor=(1,1), fancybox=True, shadow=True, fontsize=12)\n",
+    "plt.subplots_adjust(bottom=0.20)\n",
+    "fig.tight_layout()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e815ff3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity check: per gender, Count sums to the number of selections and\n",
+    "# Proportion sums to 1. Only the numeric columns are summed, so the Challenge\n",
+    "# labels are not concatenated into the result.\n",
+    "challenge_df.groupby('Gender')[['Count', 'Proportion']].sum()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.20"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}