grostest
linogaliana committed Mar 15, 2024
1 parent b11c70e commit 8a64f7e
Showing 15 changed files with 290 additions and 2,789 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/prod.yaml
@@ -0,0 +1,26 @@
name: Docker image build

on: [push]

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: linogaliana/application-correction:latest
26 changes: 26 additions & 0 deletions .github/workflows/test.yaml
@@ -0,0 +1,26 @@
name: Python package

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # latest python minor
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      - name: Lint
        run: |
          pylint src --fail-under=6
      - name: Test Python code
        run: python main.py
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
config.yaml
__pycache__/
data/
titanic/
11 changes: 11 additions & 0 deletions Dockerfile
@@ -0,0 +1,11 @@
FROM ubuntu:22.04
WORKDIR ${HOME}/titanic
# Install Python
RUN apt-get -y update && \
apt-get install -y python3-pip
# Install project dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY main.py .
COPY src ./src
CMD ["python3", "main.py"]
16 changes: 16 additions & 0 deletions README.md
@@ -0,0 +1,16 @@
# Titanic survival probability

To use this project, it is recommended
to create a `config.yaml` file with the
following structure:

```yaml
jeton_api: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```

To install the dependencies:

```bash
pip install -r requirements.txt
```
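The `import_yaml_config` helper that `main.py` uses to read this file is not part of this diff; a minimal sketch of how such a reader could be written with PyYAML (already listed in `requirements.txt`) might look like this — the exact name and behaviour are assumptions:

```python
import pathlib

import yaml


def import_yaml_config(config_path: str = "config.yaml") -> dict:
    """Return the YAML configuration as a dict, or an empty dict if the file is absent."""
    config = {}
    path = pathlib.Path(config_path)
    if path.exists():
        with path.open("r", encoding="utf-8") as stream:
            config = yaml.safe_load(stream)
    return config
```

With this behaviour, `config.get("data_path", URL_RAW)` in `main.py` falls back to the public URL whenever no `config.yaml` is present.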
892 changes: 0 additions & 892 deletions data.csv

This file was deleted.

9 changes: 9 additions & 0 deletions install.sh
@@ -0,0 +1,9 @@
#!/bin/bash
# Install Python
apt-get -y update
apt-get install -y python3-pip python3-venv
# Create empty virtual environment
python3 -m venv titanic
source titanic/bin/activate
# Install project dependencies
pip install -r requirements.txt
71 changes: 71 additions & 0 deletions main.py
@@ -0,0 +1,71 @@
"""
Prediction de la survie d'un individu sur le Titanic
"""

import argparse
import pathlib
import pandas as pd

from src.data.import_data import import_yaml_config, split_and_count
from src.pipeline.build_pipeline import split_train_test, create_pipeline
from src.models.train_evaluate import evaluate_model

parser = argparse.ArgumentParser(description="Random forest parameters")
parser.add_argument("--n_trees", type=int, default=20, help="Number of trees")
args = parser.parse_args()

n_trees = args.n_trees

URL_RAW = "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv"
config = import_yaml_config("configuration/config.yaml")
jeton_api = config.get("jeton_api")
data_path = config.get("data_path", URL_RAW)
data_train_path = config.get("train_path", "data/derived/train.csv")
data_test_path = config.get("test_path", "data/derived/test.csv")

MAX_DEPTH = None
MAX_FEATURES = "sqrt"


# DATA IMPORT AND EXPLORATION --------------------------------

TrainingData = pd.read_csv(data_path)


# Usage example:
ticket_count = split_and_count(TrainingData, "Ticket", "/")
name_count = split_and_count(TrainingData, "Name", ",")


# SPLIT TRAIN/TEST --------------------------------

p = pathlib.Path("data/derived/")
p.mkdir(parents=True, exist_ok=True)

X_train, X_test, y_train, y_test = split_train_test(
    TrainingData, test_size=0.1,
    train_path=data_train_path,
    test_path=data_test_path
)


# PIPELINE ----------------------------


# Create the pipeline
pipe = create_pipeline(
    n_trees, max_depth=MAX_DEPTH, max_features=MAX_FEATURES
)


# ESTIMATION AND EVALUATION ----------------------

pipe.fit(X_train, y_train)


# Evaluate the model
score, matrix = evaluate_model(pipe, X_test, y_test)
print(f"{score:.1%} de bonnes réponses sur les données de test pour validation")
print(20 * "-")
print("matrice de confusion")
print(matrix)
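`split_and_count` is imported from `src/data/import_data.py`, which does not appear in this diff. Purely as an illustration (the real implementation may differ), a function with that signature could count, for each row, how many fields a column contains once split on a separator:

```python
import pandas as pd


def split_and_count(df: pd.DataFrame, column: str, separator: str) -> pd.Series:
    """Hypothetical sketch: number of parts obtained when splitting `column` on `separator`."""
    return df[column].str.split(separator).str.len()
```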
File renamed without changes.
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
pandas
PyYAML
scikit-learn
18 changes: 18 additions & 0 deletions src/models/train_evaluate.py
@@ -0,0 +1,18 @@
from sklearn.metrics import confusion_matrix


def evaluate_model(pipe, X_test, y_test):
    """
    Evaluate the model by calculating the score and confusion matrix.
    Args:
        pipe (sklearn.pipeline.Pipeline): The trained pipeline object.
        X_test (pandas.DataFrame): The test data.
        y_test (pandas.Series): The true labels for the test data.
    Returns:
        tuple: A tuple containing the score and confusion matrix.
    """
    score = pipe.score(X_test, y_test)
    matrix = confusion_matrix(y_test, pipe.predict(X_test))
    return score, matrix
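A small, self-contained usage sketch of `evaluate_model`: the toy data and the `DummyClassifier` below are invented purely to illustrate the call, and the snippet assumes it is run from the repository root so that `src` is importable.

```python
import pandas as pd
from sklearn.dummy import DummyClassifier

from src.models.train_evaluate import evaluate_model

# Toy data, made up for illustration only
X = pd.DataFrame({"Age": [22, 38, 26, 35], "Fare": [7.25, 71.3, 7.9, 53.1]})
y = pd.Series([0, 1, 1, 1], name="Survived")

model = DummyClassifier(strategy="most_frequent").fit(X, y)
score, matrix = evaluate_model(model, X, y)
print(f"{score:.1%} accuracy")
print(matrix)
```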
106 changes: 106 additions & 0 deletions src/pipeline/build_pipeline.py
@@ -0,0 +1,106 @@
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import pandas as pd


def split_train_test(data, test_size, train_path="train.csv", test_path="test.csv"):
"""
Split the data into training and testing sets based on the specified test size.
Optionally, save the split datasets to CSV files.
Args:
data (pandas.DataFrame): The input data to split.
test_size (float): The proportion of the dataset to include in the test split.
train_path (str, optional): The file path to save the training dataset.
Defaults to "train.csv".
test_path (str, optional): The file path to save the testing dataset.
Defaults to "test.csv".
Returns:
tuple: A tuple containing the training and testing datasets.
"""
y = data["Survived"]
X = data.drop("Survived", axis="columns")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if train_path:
        # Save features and label side by side (column-wise concatenation)
        pd.concat([X_train, y_train], axis=1).to_csv(train_path)
    if test_path:
        pd.concat([X_test, y_test], axis=1).to_csv(test_path)

    return X_train, X_test, y_train, y_test


def create_pipeline(
    n_trees,
    numeric_features=["Age", "Fare"],
    categorical_features=["Embarked", "Sex"],
    max_depth=None,
    max_features="sqrt",
):
"""
Create a pipeline for preprocessing and model definition.
Args:
n_trees (int): The number of trees in the random forest.
numeric_features (list, optional): The numeric features to be included in the pipeline.
Defaults to ["Age", "Fare"].
categorical_features (list, optional): The categorical features to be included
in the pipeline.
Defaults to ["Embarked", "Sex"].
max_depth (int, optional): The maximum depth of the random forest. Defaults to None.
max_features (str, optional): The maximum number of features to consider
when looking for the best split.
Defaults to "sqrt".
Returns:
sklearn.pipeline.Pipeline: The pipeline object.
"""
    # Numeric variables
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # Categorical variables
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder()),
        ]
    )

    # Preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ("Preprocessing numerical", numeric_transformer, numeric_features),
            (
                "Preprocessing categorical",
                categorical_transformer,
                categorical_features,
            ),
        ]
    )

    # Pipeline
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees, max_depth=max_depth, max_features=max_features
                ),
            ),
        ]
    )

    return pipe
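A short end-to-end sketch of how these two helpers fit together, mirroring `main.py`: the miniature DataFrame is made up for illustration only, and passing `None` as the paths skips writing CSV files.

```python
import pandas as pd

from src.pipeline.build_pipeline import create_pipeline, split_train_test

# Made-up miniature dataset with the columns the pipeline expects
data = pd.DataFrame({
    "Survived": [0, 1, 1, 0, 1, 0, 1, 0],
    "Age":      [22, 38, 26, 35, 28, 54, 2, 27],
    "Fare":     [7.2, 71.3, 7.9, 53.1, 8.0, 51.9, 21.1, 11.1],
    "Embarked": ["S", "S", "S", "S", "S", "C", "C", "C"],
    "Sex":      ["male", "female", "female", "male", "male", "male", "female", "female"],
})

X_train, X_test, y_train, y_test = split_train_test(
    data, test_size=0.25, train_path=None, test_path=None
)
pipe = create_pipeline(n_trees=10)
pipe.fit(X_train, y_train)
print(f"Accuracy on the (tiny) test split: {pipe.score(X_test, y_test):.1%}")
```

Keeping imputation, scaling and encoding inside the pipeline ensures the exact same preprocessing is applied at training time and at prediction time.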