Commit

add .joblib to gitignore
LouiseDurandJanin committed Aug 21, 2023
1 parent b37c6cc commit cf833e8
Showing 3 changed files with 69 additions and 79 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -88,3 +88,4 @@ target/

# Mypy cache
.mypy_cache/
*.joblib
145 changes: 67 additions & 78 deletions notebooks/1.0-ldj-initial-data-exploration.ipynb
@@ -2625,70 +2625,44 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\lenov\\anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:828: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.\n",
" warnings.warn(\n"
]
},
{
"data": {
"text/plain": [
"Index(['Num_Acc', 'id_vehicule', 'num_veh', 'senc', 'obs', 'obsm', 'choc',\n",
" 'manv', 'motor', 'occutc', 'catv_0', 'catv_1', 'catv_2', 'catv_3',\n",
" 'catv_4', 'catv_5', 'catv_6'],\n",
" dtype='object')"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"place 0\n",
"catu 0\n",
"grav 0\n",
"sexe 0\n",
"secu1 63\n",
"victim_age 0\n",
"obsm 23\n",
"motor 0\n",
"catv_0 0\n",
"catv_1 0\n",
"catv_2 0\n",
"catv_3 0\n",
"catv_4 0\n",
"catv_5 0\n",
"catv_6 0\n",
"catr 0\n",
"circ 0\n",
"surf 0\n",
"situ 0\n",
"vma 875\n",
"jour 0\n",
"mois 0\n",
"an 0\n",
"lum 0\n",
"dep 0\n",
"com 0\n",
"agg_ 0\n",
"int 0\n",
"atm 8\n",
"col 0\n",
"lat 0\n",
"long 0\n",
"hour 0\n",
"nb_victim 0\n",
"nb_vehicules 0\n",
"dtype: int64\n",
"(56518, 35)\n"
" Column Missing Count\n",
"11 circ 3057\n",
"9 motor 2157\n",
"24 col 19\n",
"12 surf 1\n",
"0 place 0\n",
"17 an 0\n",
"28 nb_victim 0\n",
"27 hour 0\n",
"26 long 0\n",
"25 lat 0\n",
"23 atm 0\n",
"22 int 0\n",
"21 agg_ 0\n",
"20 com 0\n",
"19 dep 0\n",
"18 lum 0\n",
"15 jour 0\n",
"16 mois 0\n",
"1 catu 0\n",
"14 vma 0\n",
"13 situ 0\n",
"10 catr 0\n",
"8 obsm 0\n",
"7 catv 0\n",
"6 victim_age 0\n",
"5 year_acc 0\n",
"4 secu1 0\n",
"3 sexe 0\n",
"2 grav 0\n",
"29 nb_vehicules 0\n"
]
}
],
@@ -2699,18 +2673,19 @@
"df_caract=pd.read_csv(\"C:/Users/lenov/Documents/Template_MLOps_accidents/data/raw/caracteristiques-2021.csv\", sep = \";\", encoding='utf-8')\n",
"df_veh=pd.read_csv(\"C:/Users/lenov/Documents/Template_MLOps_accidents/data/raw/vehicules-2021.csv\", sep=\";\")\n",
"\n",
"#--Creating new columns\n",
"\n",
"\n",
" #--Creating new columns\n",
"nb_victim = pd.crosstab(df_users.Num_Acc, \"count\").reset_index()\n",
"nb_vehicules = pd.crosstab(df_veh.Num_Acc, \"count\").reset_index()\n",
"df_users[\"year_acc\"] = df_users[\"Num_Acc\"].astype(str).apply(lambda x : x[:4]).astype(int)\n",
"df_users[\"victim_age\"] = df_users[\"year_acc\"]-df_users[\"an_nais\"]\n",
"for i in df_users[\"victim_age\"] :\n",
" if (i>120)|(i<0):\n",
" df_users[\"victim_age\"].replace(i,np.nan)\n",
"\n",
"df_users.drop([\"year_acc\",\"an_nais\"], inplace=True, axis=1)\n",
" df_users[\"victim_age\"].replace(i,np.nan)\n",
"df_caract[\"hour\"] = df_caract[\"hrmn\"].astype(str).apply(lambda x : x[:-3])\n",
"df_caract.drop(['hrmn'], inplace=True, axis=1)\n",
"df_users.drop(['an_nais'], inplace=True, axis=1)\n",
"\n",
"#--Replacing names \n",
"df_users.grav.replace([1,2,3,4], [1,3,4,2], inplace = True)\n",
@@ -2730,17 +2705,13 @@
"df_caract = df_caract.astype(dico_to_float)\n",
"\n",
"\n",
"#--Grouping the modalities \n",
"#--Grouping modalities \n",
"dico = {1:0, 2:1, 3:1, 4:1, 5:1, 6:1,7:1, 8:0, 9:0}\n",
"df_caract[\"atm\"] = df_caract[\"atm\"].replace(dico)\n",
"catv_value = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,30,31,32,33,34,35,36,37,38,39,40,41,42,43,50,60,80,99]\n",
"catv_value_new = [0,1,1,2,1,1,6,2,5,5,5,5,5,4,4,4,4,4,3,3,4,4,1,1,1,1,1,6,6,3,3,3,3,1,1,1,1,1,0,0]\n",
"df_veh['catv'].replace(catv_value, catv_value_new, inplace = True)\n",
"encoder = OneHotEncoder(sparse=False, drop='first')\n",
"encoded = pd.DataFrame(encoder.fit_transform(df_veh['catv'].values.reshape(-1, 1)), columns=encoder.get_feature_names_out(['catv']))\n",
"df_veh = pd.concat([df_veh, encoded], axis = 1)\n",
"df_veh.drop(\"catv\", axis=1, inplace=True)\n",
"display(df_veh.columns)\n",
"\n",
"#--Merging datasets \n",
"fusion1= df_users.merge(df_veh, on = [\"Num_Acc\",\"num_veh\", \"id_vehicule\"], how=\"inner\")\n",
"fusion1 = fusion1.sort_values(by = \"grav\", ascending = False)\n",
@@ -2754,32 +2725,50 @@
"df = df.merge(nb_vehicules, on = \"Num_Acc\", how = \"inner\") \n",
"df.rename({\"count\" :\"nb_vehicules\"},axis = 1, inplace = True)\n",
"\n",
"#--Modification of the target variable : 1 : prioritary// 0 : non-prioritary\n",
"#--Modification of the target variable : 1 : prioritary // 0 : non-prioritary\n",
"df['grav'].replace([2,3,4], [0,1,1], inplace=True)\n",
"\n",
"\n",
"#--Replacing values -1 and 0 \n",
"col_to_replace0_na = [ \"trajet\", \"motor\"]\n",
"col_to_replace1_na = [ \"trajet\", \"secu1\", \"obsm\", \"motor\", \"circ\", \"surf\", \"situ\", \"vma\", \"atm\", \"col\"]\n",
"col_to_replace0_na = [ \"trajet\", \"catv\", \"motor\"]\n",
"col_to_replace1_na = [ \"trajet\", \"secu1\", \"catv\", \"obsm\", \"motor\", \"circ\", \"surf\", \"situ\", \"vma\", \"atm\", \"col\"]\n",
"df[col_to_replace1_na] = df[col_to_replace1_na].replace(-1, np.nan)\n",
"df[col_to_replace0_na] = df[col_to_replace0_na].replace(0, np.nan)\n",
"\n",
"\n",
"#--Dropping columns \n",
"list_to_drop = ['senc','larrout','actp', 'manv', 'choc', 'nbv', 'prof', 'plan', 'Num_Acc', 'id_vehicule', 'num_veh', 'pr', 'pr1','voie', 'trajet',\"secu2\", \"secu3\",'adr', 'v1', 'lartpc','occutc','v2','vosp','locp','etatp', 'infra', 'obs' ]\n",
"df.drop(list_to_drop, axis=1, inplace=True)\n",
"missing_values = df.isna().sum()\n",
"\n",
"missing_values_sorted = missing_values.sort_values(ascending=False)\n",
"#--Dropping lines with NaN values\n",
"col_to_drop_lines = [ 'catv', 'vma', 'secu1', 'obsm', 'atm']\n",
"df = df.dropna(subset = col_to_drop_lines, axis=0)\n",
"# Calculate the number of missing values in each column\n",
"missing_values_count = df.isnull().sum()\n",
"\n",
"# Create a DataFrame to store the missing values count and column names\n",
"missing_values_df = pd.DataFrame({'Column': missing_values_count.index, 'Missing Count': missing_values_count.values})\n",
"\n",
"#--Filling NaN values\n",
"col_to_fill_na = [\"surf\", \"situ\", \"circ\", \"col\", \"motor\"]\n",
"df[col_to_fill_na] = df[col_to_fill_na].fillna(df[col_to_fill_na].mode().iloc[0])\n",
"# Sort the DataFrame by the missing values count in decreasing order\n",
"missing_values_df_sorted = missing_values_df.sort_values(by='Missing Count', ascending=False)\n",
"\n",
"# Print the sorted DataFrame\n",
"print(missing_values_df_sorted)\n",
"\n",
"\n",
"target = df['grav']\n",
"feats = df.drop(['grav'], axis = 1)\n",
"\n",
"print(df.isna().sum())\n",
"X_train, X_test, y_train, y_test = train_test_split(feats, target, test_size=0.3, random_state = 42)\n",
"\n",
"#--Filling NaN values\n",
"col_to_fill_na = [\"surf\", \"circ\", \"col\", \"motor\"]\n",
"X_train[col_to_fill_na] = X_train[col_to_fill_na].fillna(X_train[col_to_fill_na].mode().iloc[0])\n",
"X_test[col_to_fill_na] = X_test[col_to_fill_na].fillna(X_train[col_to_fill_na].mode().iloc[0])\n",
"\n",
"print(df.shape)"
"#--Dropping last lines with NaN values\n",
"X_train = X_train.dropna(axis=0)\n",
"X_test = X_test.dropna(axis=0)"
]
},
{
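For reference, a minimal standalone sketch of the imputation pattern the updated notebook cell moves to: the column modes are computed on the training split only and then applied to both splits, so the test data does not influence preprocessing. The column names and values below are illustrative placeholders, not taken from the accident dataset.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Toy frame with missing values (hypothetical columns, for illustration only)
df = pd.DataFrame({
    "surf": [1, 1, 2, np.nan, 1, 2, np.nan, 1],
    "circ": [2, np.nan, 2, 2, 1, np.nan, 2, 2],
    "grav": [0, 1, 0, 1, 1, 0, 0, 1],
})

feats = df.drop("grav", axis=1)
target = df["grav"]
X_train, X_test, y_train, y_test = train_test_split(
    feats, target, test_size=0.3, random_state=42
)

# Mode of each column, computed on the training split only
cols = ["surf", "circ"]
train_modes = X_train[cols].mode().iloc[0]

# The same training-split modes fill the gaps in both splits
X_train = X_train.copy()
X_test = X_test.copy()
X_train[cols] = X_train[cols].fillna(train_modes)
X_test[cols] = X_test[cols].fillna(train_modes)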
2 changes: 1 addition & 1 deletion src/models/train_model.py
@@ -16,6 +16,6 @@
rf_classifier.fit(X_train, y_train)

#--Save the trained model to a file
model_filename = 'C:/Users/lenov/Documents/Template_MLOps_accidents/src/models/trained_model.joblib'
model_filename = '/src/models/trained_model.joblib'
joblib.dump(rf_classifier, model_filename)
print("Model trained and saved successfully.")
