Chap 03 - Révision des exemples selon DC

calculquebec · Jan 31, 2025 · b20a69e · b20a69e
1 parent f26e531
commit b20a69e
Show file tree

Hide file tree

Showing 5 changed files with 195 additions and 117 deletions.
diff --git a/en/03-format.ipynb b/en/03-format.ipynb
@@ -59,17 +59,32 @@
     "### Checking the format of our data"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "888faaed-8326-47b3-b9e7-7f24a6af09ac",
+   "metadata": {
+    "lang": "en"
+   },
+   "source": [
+    "Native Python Type | Pandas Type | Description\n",
+    ":-----------------:|:-----------:|:-----------\n",
+    "`str`              | `object`    | The most general dtype. Will be assigned to your column if the column has mixed types (numbers and strings).\n",
+    "`int`              | `int64`     | 64-bit integer\n",
+    "`float`            | `float64`   | Numeric values with decimals. If a column contains numbers and NaNs (see below), pandas will default to float64.\n",
+    " N/A               | `datetime64`| Values meant to hold time data."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "52b2c88d-886b-4369-b632-d5302b48f303",
+   "id": "a7fc2dd4-ccf3-44a5-a9be-e032b426f937",
    "metadata": {
     "lang": "en"
    },
    "outputs": [],
    "source": [
-    "# Getting the data types of all columns\n",
-    "surveys_df.dtypes"
+    "# Getting the data type of species identifiers\n",
+    "surveys_df['species_id'].dtype"
    ]
   },
   {
@@ -81,23 +96,21 @@
    },
    "outputs": [],
    "source": [
-    "# Getting the data type of a single column\n",
+    "# Getting the data type of month values\n",
     "surveys_df['month'].dtype"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "888faaed-8326-47b3-b9e7-7f24a6af09ac",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52b2c88d-886b-4369-b632-d5302b48f303",
    "metadata": {
     "lang": "en"
    },
+   "outputs": [],
    "source": [
-    "Native Python Type | Pandas Type | Description\n",
-    ":-----------------:|:-----------:|:-----------\n",
-    "`str`              | `object`    | The most general dtype. Will be assigned to your column if column has mixed types (numbers and strings).\n",
-    "`int`              | `int64`     | 64 bits integer\n",
-    "`float`            | `float64`   | Numeric characters with decimals. If a column contains numbers and NaNs(see below), pandas will default to float64.\n",
-    " N/A               | `datetime64`| Values meant to hold time data."
+    "# Getting the data types of all columns\n",
+    "surveys_df.dtypes"
    ]
   },
   {
@@ -321,7 +334,7 @@
    "outputs": [],
    "source": [
     "# Create a copy to avoid modifying the original object\n",
-    "copy_surveys_df = surveys_df.copy()"
+    "copy_surveys = surveys_df.copy()"
    ]
   },
   {
@@ -334,8 +347,8 @@
    "outputs": [],
    "source": [
     "# For a stable mean value\n",
-    "averageW = copy_surveys_df['weight'].mean()\n",
-    "copy_surveys_df['weight'] = copy_surveys_df['weight'].fillna(averageW)"
+    "copy_surveys['weight'] = copy_surveys['weight'].fillna(\n",
+    "    copy_surveys['weight'].mean())"
    ]
   },
   {
@@ -348,7 +361,7 @@
    "outputs": [],
    "source": [
     "# After the cleanup\n",
-    "print(copy_surveys_df['weight'].count(), copy_surveys_df['weight'].mean())"
+    "print(copy_surveys['weight'].count(), copy_surveys['weight'].mean())"
    ]
   },
   {
@@ -361,8 +374,8 @@
    "outputs": [],
    "source": [
     "# Can we now convert weight values to integers?\n",
-    "copy_surveys_df['weight'] = copy_surveys_df['weight'].astype('int64')\n",
-    "copy_surveys_df['weight'].mean()"
+    "copy_surveys['weight'] = copy_surveys['weight'].astype('int64')\n",
+    "copy_surveys['weight'].mean()"
    ]
   },
   {
@@ -373,7 +386,7 @@
    },
    "source": [
     "### Exercise - Data Cleanup\n",
-    "In the `sex` column of `copy_surveys_df`:\n",
+    "In the `sex` column of `copy_surveys`:\n",
     "* Replace undefined values by `'F|M'`\n",
     "* Any value not equal to `'F'`, `'M'` or `'F|M'` is\n",
     "  considered invalid and must be replaced by `'F|M'`\n",
@@ -394,16 +407,16 @@
    "outputs": [],
    "source": [
     "# Create invalid data\n",
-    "copy_surveys_df.loc[::123, 'sex'] = 'NA'\n",
+    "copy_surveys.loc[::123, 'sex'] = 'NA'\n",
     "\n",
     "# Replace undefined values\n",
-    "#copy_surveys_df['sex'] = ###\n",
+    "#copy_surveys['sex'] = ###\n",
     "\n",
     "# Replace invalid values\n",
     "#invalid_rows = ###.isin(['F', 'F|M', 'M'])\n",
-    "#copy_surveys_df.loc[invalid_rows, ###] = ###\n",
+    "#copy_surveys.loc[invalid_rows, ###] = ###\n",
     "\n",
-    "copy_surveys_df['sex'].unique()"
+    "copy_surveys['sex'].unique()"
    ]
   },
   {
@@ -426,7 +439,7 @@
    "outputs": [],
    "source": [
     "# Only keep (complete) records that have no NA\n",
-    "df_no_na = copy_surveys_df.dropna()\n",
+    "df_no_na = copy_surveys.dropna()\n",
     "df_no_na"
    ]
   },

diff --git a/fr/03-format.ipynb b/fr/03-format.ipynb
@@ -59,17 +59,32 @@
     "### Vérifier le type de données pour chaque colonne"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "bff92e0f-ea1a-45c2-b6cb-ce919ac96cf1",
+   "metadata": {
+    "lang": "fr"
+   },
+   "source": [
+    "Types Python | Types Pandas | Description\n",
+    ":-----------:|:------------:|:-----------\n",
+    "`str`        | `object`     | Type générique, aussi utilisé en cas de multiples types\n",
+    "`int`        | `int64`      | Nombres entiers représentés avec 64 bits\n",
+    "`float`      | `float64`    | Nombres réels représentés avec 64 bits, ou non-définis (NaN)\n",
+    " N/A         | `datetime64` | Dates et heures, avec une précision allant jusqu'à la nanoseconde"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f5260077-a222-40b6-b621-4764bc78df71",
+   "id": "593c6c69-8f03-40e5-a485-39e8f68aa19f",
    "metadata": {
     "lang": "fr"
    },
    "outputs": [],
    "source": [
-    "# Obtenir le type de données pour chaque colonne\n",
-    "surveys_df.dtypes"
+    "# Le type de données pour les identifiants d'espèces\n",
+    "surveys_df['species_id'].dtype"
    ]
   },
   {
@@ -86,18 +101,16 @@
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "bff92e0f-ea1a-45c2-b6cb-ce919ac96cf1",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f5260077-a222-40b6-b621-4764bc78df71",
    "metadata": {
     "lang": "fr"
    },
+   "outputs": [],
    "source": [
-    "Types Python | Types Pandas | Description\n",
-    ":-----------:|:------------:|:-----------\n",
-    "`str`        | `object`     | Type générique, aussi utilisé en cas de multiples types\n",
-    "`int`        | `int64`      | Nombres entiers représentés avec 64 bits\n",
-    "`float`      | `float64`    | Nombres réels représentés avec 64 bits, ou non-définis (NaN)\n",
-    " N/A         | `datetime64` | Dates et heures, avec une précision allant jusqu'à la nanoseconde"
+    "# Obtenir le type de données pour chaque colonne\n",
+    "surveys_df.dtypes"
    ]
   },
   {
@@ -334,8 +347,8 @@
    "outputs": [],
    "source": [
     "# Pour une valeur moyenne stable\n",
-    "moyennePoids = copie_surveys['weight'].mean()\n",
-    "copie_surveys['weight'] = copie_surveys['weight'].fillna(moyennePoids)"
+    "copie_surveys['weight'] = copie_surveys['weight'].fillna(\n",
+    "    copie_surveys['weight'].mean())"
    ]
   },
   {

diff --git a/solution-en/03-format.ipynb b/solution-en/03-format.ipynb
@@ -59,17 +59,32 @@
     "### Checking the format of our data"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "888faaed-8326-47b3-b9e7-7f24a6af09ac",
+   "metadata": {
+    "lang": "en"
+   },
+   "source": [
+    "Native Python Type | Pandas Type | Description\n",
+    ":-----------------:|:-----------:|:-----------\n",
+    "`str`              | `object`    | The most general dtype. Will be assigned to your column if the column has mixed types (numbers and strings).\n",
+    "`int`              | `int64`     | 64-bit integer\n",
+    "`float`            | `float64`   | Numeric values with decimals. If a column contains numbers and NaNs (see below), pandas will default to float64.\n",
+    " N/A               | `datetime64`| Values meant to hold time data."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "52b2c88d-886b-4369-b632-d5302b48f303",
+   "id": "a7fc2dd4-ccf3-44a5-a9be-e032b426f937",
    "metadata": {
     "lang": "en"
    },
    "outputs": [],
    "source": [
-    "# Getting the data types of all columns\n",
-    "surveys_df.dtypes"
+    "# Getting the data type of species identifiers\n",
+    "surveys_df['species_id'].dtype"
    ]
   },
   {
@@ -81,23 +96,21 @@
    },
    "outputs": [],
    "source": [
-    "# Getting the data type of a single column\n",
+    "# Getting the data type of month values\n",
     "surveys_df['month'].dtype"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "888faaed-8326-47b3-b9e7-7f24a6af09ac",
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52b2c88d-886b-4369-b632-d5302b48f303",
    "metadata": {
     "lang": "en"
    },
+   "outputs": [],
    "source": [
-    "Native Python Type | Pandas Type | Description\n",
-    ":-----------------:|:-----------:|:-----------\n",
-    "`str`              | `object`    | The most general dtype. Will be assigned to your column if column has mixed types (numbers and strings).\n",
-    "`int`              | `int64`     | 64 bits integer\n",
-    "`float`            | `float64`   | Numeric characters with decimals. If a column contains numbers and NaNs(see below), pandas will default to float64.\n",
-    " N/A               | `datetime64`| Values meant to hold time data."
+    "# Getting the data types of all columns\n",
+    "surveys_df.dtypes"
    ]
   },
   {
@@ -321,7 +334,7 @@
    "outputs": [],
    "source": [
     "# Create a copy to avoid modifying the original object\n",
-    "copy_surveys_df = surveys_df.copy()"
+    "copy_surveys = surveys_df.copy()"
    ]
   },
   {
@@ -334,8 +347,8 @@
    "outputs": [],
    "source": [
     "# For a stable mean value\n",
-    "averageW = copy_surveys_df['weight'].mean()\n",
-    "copy_surveys_df['weight'] = copy_surveys_df['weight'].fillna(averageW)"
+    "copy_surveys['weight'] = copy_surveys['weight'].fillna(\n",
+    "    copy_surveys['weight'].mean())"
    ]
   },
   {
@@ -348,7 +361,7 @@
    "outputs": [],
    "source": [
     "# After the cleanup\n",
-    "print(copy_surveys_df['weight'].count(), copy_surveys_df['weight'].mean())"
+    "print(copy_surveys['weight'].count(), copy_surveys['weight'].mean())"
    ]
   },
   {
@@ -361,8 +374,8 @@
    "outputs": [],
    "source": [
     "# Can we now convert weight values to integers?\n",
-    "copy_surveys_df['weight'] = copy_surveys_df['weight'].astype('int64')\n",
-    "copy_surveys_df['weight'].mean()"
+    "copy_surveys['weight'] = copy_surveys['weight'].astype('int64')\n",
+    "copy_surveys['weight'].mean()"
    ]
   },
   {
@@ -373,7 +386,7 @@
    },
    "source": [
     "### Exercise - Data Cleanup\n",
-    "In the `sex` column of `copy_surveys_df`:\n",
+    "In the `sex` column of `copy_surveys`:\n",
     "* Replace undefined values by `'F|M'`\n",
     "* Any value not equal to `'F'`, `'M'` or `'F|M'` is\n",
     "  considered invalid and must be replaced by `'F|M'`\n",
@@ -394,16 +407,16 @@
    "outputs": [],
    "source": [
     "# Create invalid data\n",
-    "copy_surveys_df.loc[::123, 'sex'] = 'NA'\n",
+    "copy_surveys.loc[::123, 'sex'] = 'NA'\n",
     "\n",
     "# Replace undefined values\n",
-    "copy_surveys_df['sex'] = copy_surveys_df['sex'].fillna('F|M')\n",
+    "copy_surveys['sex'] = copy_surveys['sex'].fillna('F|M')\n",
     "\n",
     "# Replace invalid values\n",
-    "invalid_rows = ~copy_surveys_df['sex'].isin(['F', 'F|M', 'M'])\n",
-    "copy_surveys_df.loc[invalid_rows, 'sex'] = 'F|M'\n",
+    "invalid_rows = ~copy_surveys['sex'].isin(['F', 'F|M', 'M'])\n",
+    "copy_surveys.loc[invalid_rows, 'sex'] = 'F|M'\n",
     "\n",
-    "copy_surveys_df['sex'].unique()"
+    "copy_surveys['sex'].unique()"
    ]
   },
   {
@@ -426,7 +439,7 @@
    "outputs": [],
    "source": [
     "# Only keep (complete) records that have no NA\n",
-    "df_no_na = copy_surveys_df.dropna()\n",
+    "df_no_na = copy_surveys.dropna()\n",
     "df_no_na"
    ]
   },