Commit

add .joblib to gitignore
LouiseDurandJanin committed Aug 21, 2023
1 parent b37c6cc commit cf833e8
Showing 3 changed files with 69 additions and 79 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -88,3 +88,4 @@ target/

# Mypy cache
.mypy_cache/
*.joblib
145 changes: 67 additions & 78 deletions notebooks/1.0-ldj-initial-data-exploration.ipynb
@@ -2625,70 +2625,44 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\lenov\\anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:828: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"Index(['Num_Acc', 'id_vehicule', 'num_veh', 'senc', 'obs', 'obsm', 'choc',\n",
" 'manv', 'motor', 'occutc', 'catv_0', 'catv_1', 'catv_2', 'catv_3',\n",
" 'catv_4', 'catv_5', 'catv_6'],\n",
" dtype='object')"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"place 0\n",
"catu 0\n",
"grav 0\n",
"sexe 0\n",
"secu1 63\n",
"victim_age 0\n",
"obsm 23\n",
"motor 0\n",
"catv_0 0\n",
"catv_1 0\n",
"catv_2 0\n",
"catv_3 0\n",
"catv_4 0\n",
"catv_5 0\n",
"catv_6 0\n",
"catr 0\n",
"circ 0\n",
"surf 0\n",
"situ 0\n",
"vma 875\n",
"jour 0\n",
"mois 0\n",
"an 0\n",
"lum 0\n",
"dep 0\n",
"com 0\n",
"agg_ 0\n",
"int 0\n",
"atm 8\n",
"col 0\n",
"lat 0\n",
"long 0\n",
"hour 0\n",
"nb_victim 0\n",
"nb_vehicules 0\n",
"dtype: int64\n",
"(56518, 35)\n"
" Column Missing Count\n",
"11 circ 3057\n",
"9 motor 2157\n",
"24 col 19\n",
"12 surf 1\n",
"0 place 0\n",
"17 an 0\n",
"28 nb_victim 0\n",
"27 hour 0\n",
"26 long 0\n",
"25 lat 0\n",
"23 atm 0\n",
"22 int 0\n",
"21 agg_ 0\n",
"20 com 0\n",
"19 dep 0\n",
"18 lum 0\n",
"15 jour 0\n",
"16 mois 0\n",
"1 catu 0\n",
"14 vma 0\n",
"13 situ 0\n",
"10 catr 0\n",
"8 obsm 0\n",
"7 catv 0\n",
"6 victim_age 0\n",
"5 year_acc 0\n",
"4 secu1 0\n",
"3 sexe 0\n",
"2 grav 0\n",
"29 nb_vehicules 0\n"
]
}
],
@@ -2699,18 +2673,19 @@
"df_caract=pd.read_csv(\"C:/Users/lenov/Documents/Template_MLOps_accidents/data/raw/caracteristiques-2021.csv\", sep = \";\", encoding='utf-8')\n",
"df_veh=pd.read_csv(\"C:/Users/lenov/Documents/Template_MLOps_accidents/data/raw/vehicules-2021.csv\", sep=\";\")\n",
"\n",
"#--Creating new columns\n",
"\n",
"\n",
" #--Creating new columns\n",
"nb_victim = pd.crosstab(df_users.Num_Acc, \"count\").reset_index()\n",
"nb_vehicules = pd.crosstab(df_veh.Num_Acc, \"count\").reset_index()\n",
"df_users[\"year_acc\"] = df_users[\"Num_Acc\"].astype(str).apply(lambda x : x[:4]).astype(int)\n",
"df_users[\"victim_age\"] = df_users[\"year_acc\"]-df_users[\"an_nais\"]\n",
"for i in df_users[\"victim_age\"] :\n",
" if (i>120)|(i<0):\n",
" df_users[\"victim_age\"].replace(i,np.nan)\n",
"\n",
"df_users.drop([\"year_acc\",\"an_nais\"], inplace=True, axis=1)\n",
" df_users[\"victim_age\"].replace(i,np.nan)\n",
"df_caract[\"hour\"] = df_caract[\"hrmn\"].astype(str).apply(lambda x : x[:-3])\n",
"df_caract.drop(['hrmn'], inplace=True, axis=1)\n",
"df_users.drop(['an_nais'], inplace=True, axis=1)\n",
"\n",
"#--Replacing names \n",
"df_users.grav.replace([1,2,3,4], [1,3,4,2], inplace = True)\n",
@@ -2730,17 +2705,13 @@
"df_caract = df_caract.astype(dico_to_float)\n",
"\n",
"\n",
"#--Grouping the modalities \n",
"#--Grouping modalities \n",
"dico = {1:0, 2:1, 3:1, 4:1, 5:1, 6:1,7:1, 8:0, 9:0}\n",
"df_caract[\"atm\"] = df_caract[\"atm\"].replace(dico)\n",
"catv_value = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,30,31,32,33,34,35,36,37,38,39,40,41,42,43,50,60,80,99]\n",
"catv_value_new = [0,1,1,2,1,1,6,2,5,5,5,5,5,4,4,4,4,4,3,3,4,4,1,1,1,1,1,6,6,3,3,3,3,1,1,1,1,1,0,0]\n",
"df_veh['catv'].replace(catv_value, catv_value_new, inplace = True)\n",
"encoder = OneHotEncoder(sparse=False, drop='first')\n",
"encoded = pd.DataFrame(encoder.fit_transform(df_veh['catv'].values.reshape(-1, 1)), columns=encoder.get_feature_names_out(['catv']))\n",
"df_veh = pd.concat([df_veh, encoded], axis = 1)\n",
"df_veh.drop(\"catv\", axis=1, inplace=True)\n",
"display(df_veh.columns)\n",
"\n",
"#--Merging datasets \n",
"fusion1= df_users.merge(df_veh, on = [\"Num_Acc\",\"num_veh\", \"id_vehicule\"], how=\"inner\")\n",
"fusion1 = fusion1.sort_values(by = \"grav\", ascending = False)\n",
@@ -2754,32 +2725,50 @@
"df = df.merge(nb_vehicules, on = \"Num_Acc\", how = \"inner\") \n",
"df.rename({\"count\" :\"nb_vehicules\"},axis = 1, inplace = True)\n",
"\n",
"#--Modification of the target variable : 1 : prioritary// 0 : non-prioritary\n",
"#--Modification of the target variable : 1 : prioritary // 0 : non-prioritary\n",
"df['grav'].replace([2,3,4], [0,1,1], inplace=True)\n",
"\n",
"\n",
"#--Replacing values -1 and 0 \n",
"col_to_replace0_na = [ \"trajet\", \"motor\"]\n",
"col_to_replace1_na = [ \"trajet\", \"secu1\", \"obsm\", \"motor\", \"circ\", \"surf\", \"situ\", \"vma\", \"atm\", \"col\"]\n",
"col_to_replace0_na = [ \"trajet\", \"catv\", \"motor\"]\n",
"col_to_replace1_na = [ \"trajet\", \"secu1\", \"catv\", \"obsm\", \"motor\", \"circ\", \"surf\", \"situ\", \"vma\", \"atm\", \"col\"]\n",
"df[col_to_replace1_na] = df[col_to_replace1_na].replace(-1, np.nan)\n",
"df[col_to_replace0_na] = df[col_to_replace0_na].replace(0, np.nan)\n",
"\n",
"\n",
"#--Dropping columns \n",
"list_to_drop = ['senc','larrout','actp', 'manv', 'choc', 'nbv', 'prof', 'plan', 'Num_Acc', 'id_vehicule', 'num_veh', 'pr', 'pr1','voie', 'trajet',\"secu2\", \"secu3\",'adr', 'v1', 'lartpc','occutc','v2','vosp','locp','etatp', 'infra', 'obs' ]\n",
"df.drop(list_to_drop, axis=1, inplace=True)\n",
"missing_values = df.isna().sum()\n",
"\n",
"missing_values_sorted = missing_values.sort_values(ascending=False)\n",
"#--Dropping lines with NaN values\n",
"col_to_drop_lines = [ 'catv', 'vma', 'secu1', 'obsm', 'atm']\n",
"df = df.dropna(subset = col_to_drop_lines, axis=0)\n",
"# Calculate the number of missing values in each column\n",
"missing_values_count = df.isnull().sum()\n",
"\n",
"# Create a DataFrame to store the missing values count and column names\n",
"missing_values_df = pd.DataFrame({'Column': missing_values_count.index, 'Missing Count': missing_values_count.values})\n",
"\n",
"#--Filling NaN values\n",
"col_to_fill_na = [\"surf\", \"situ\", \"circ\", \"col\", \"motor\"]\n",
"df[col_to_fill_na] = df[col_to_fill_na].fillna(df[col_to_fill_na].mode().iloc[0])\n",
"# Sort the DataFrame by the missing values count in decreasing order\n",
"missing_values_df_sorted = missing_values_df.sort_values(by='Missing Count', ascending=False)\n",
"\n",
"# Print the sorted DataFrame\n",
"print(missing_values_df_sorted)\n",
"\n",
"\n",
"target = df['grav']\n",
"feats = df.drop(['grav'], axis = 1)\n",
"\n",
"print(df.isna().sum())\n",
"X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size=0.3, random_state = 42)\n",
"\n",
"#--Filling NaN values\n",
"col_to_fill_na = [\"surf\", \"circ\", \"col\", \"motor\"]\n",
"X_train[col_to_fill_na] = X_train[col_to_fill_na].fillna(X_train[col_to_fill_na].mode().iloc[0])\n",
"X_test[col_to_fill_na] = X_test[col_to_fill_na].fillna(X_train[col_to_fill_na].mode().iloc[0])\n",
"\n",
"print(df.shape)"
"#--Dropping last lines with NaN values\n",
"X_train = X_train.dropna(axis=0)\n",
"X_test = X_test.dropna(axis=0)"
]
},
{
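For reference, a minimal standalone sketch of the imputation pattern the updated notebook cell moves to: the column modes are computed on the training split only and then applied to both splits, so the test data does not influence preprocessing. The column names and values below are illustrative placeholders, not taken from the accident dataset.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy frame with missing values (hypothetical columns, for illustration only)
df = pd.DataFrame({
    "surf": [1, 1, 2, np.nan, 1, 2, np.nan, 1],
    "circ": [2, np.nan, 2, 2, 1, np.nan, 2, 2],
    "grav": [0, 1, 0, 1, 1, 0, 0, 1],
})

feats = df.drop("grav", axis=1)
target = df["grav"]
X_train, X_test, y_train, y_test = train_test_split(
    feats, target, test_size=0.3, random_state=42
)

# Mode of each column, computed on the training split only
cols = ["surf", "circ"]
train_modes = X_train[cols].mode().iloc[0]

# The same training-split modes fill the gaps in both splits
X_train = X_train.copy()
X_test = X_test.copy()
X_train[cols] = X_train[cols].fillna(train_modes)
X_test[cols] = X_test[cols].fillna(train_modes)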
2 changes: 1 addition & 1 deletion src/models/train_model.py
@@ -16,6 +16,6 @@
rf_classifier.fit(X_train, y_train)

#--Save the trained model to a file
model_filename = 'C:/Users/lenov/Documents/Template_MLOps_accidents/src/models/trained_model.joblib'
model_filename = '/src/models/trained_model.joblib'
joblib.dump(rf_classifier, model_filename)
print("Model trained and saved successfully.")
