grostest
linogaliana committed Mar 15, 2024
1 parent b11c70e commit 8a64f7e
Showing 15 changed files with 290 additions and 2,789 deletions.
26 changes: 26 additions & 0 deletions .github/workflows/prod.yaml
@@ -0,0 +1,26 @@
name: Docker image build

on: [push]

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: linogaliana/application-correction:latest
26 changes: 26 additions & 0 deletions .github/workflows/test.yaml
@@ -0,0 +1,26 @@
name: Python package

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # latest python minor
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      - name: Lint
        run: |
          pylint src --fail-under=6
      - name: Test Python code
        run: python main.py
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
config.yaml
__pycache__/
data/
titanic/
11 changes: 11 additions & 0 deletions Dockerfile
@@ -0,0 +1,11 @@
FROM ubuntu:22.04
WORKDIR ${HOME}/titanic
# Install Python
RUN apt-get -y update && \
apt-get install -y python3-pip
# Install project dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY main.py .
COPY src ./src
CMD ["python3", "main.py"]
16 changes: 16 additions & 0 deletions README.md
@@ -0,0 +1,16 @@
# Titanic survival probability

To use this project, it is recommended
to create a `config.yaml` file with the
following structure:

```yaml
jeton_api: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```

To install the dependencies:

```bash
pip install -r requirements.txt
```
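The `import_yaml_config` helper that `main.py` uses to read this file is not part of this diff; a minimal sketch of how such a reader could be written with PyYAML (already listed in `requirements.txt`) might look like this — the exact name and behaviour are assumptions:

```python
import pathlib

import yaml


def import_yaml_config(config_path: str = "config.yaml") -> dict:
    """Return the YAML configuration as a dict, or an empty dict if the file is absent."""
    config = {}
    path = pathlib.Path(config_path)
    if path.exists():
        with path.open("r", encoding="utf-8") as stream:
            config = yaml.safe_load(stream)
    return config
```

With this behaviour, `config.get("data_path", URL_RAW)` in `main.py` falls back to the public URL whenever no `config.yaml` is present.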
892 changes: 0 additions & 892 deletions data.csv

This file was deleted.

9 changes: 9 additions & 0 deletions install.sh
@@ -0,0 +1,9 @@
#!/bin/bash
# Install Python
apt-get -y update
apt-get install -y python3-pip python3-venv
# Create empty virtual environment
python3 -m venv titanic
source titanic/bin/activate
# Install project dependencies
pip install -r requirements.txt
71 changes: 71 additions & 0 deletions main.py
@@ -0,0 +1,71 @@
"""
Prediction de la survie d'un individu sur le Titanic
"""

import argparse
import pathlib
import pandas as pd

from src.data.import_data import import_yaml_config, split_and_count
from src.pipeline.build_pipeline import split_train_test, create_pipeline
from src.models.train_evaluate import evaluate_model

parser = argparse.ArgumentParser(description="Random forest parameters")
parser.add_argument("--n_trees", type=int, default=20, help="Number of trees")
args = parser.parse_args()

n_trees = args.n_trees

URL_RAW = "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv"
config = import_yaml_config("configuration/config.yaml")
jeton_api = config.get("jeton_api")
data_path = config.get("data_path", URL_RAW)
data_train_path = config.get("train_path", "data/derived/train.csv")
data_test_path = config.get("test_path", "data/derived/test.csv")

MAX_DEPTH = None
MAX_FEATURES = "sqrt"


# DATA IMPORT AND EXPLORATION --------------------------------

TrainingData = pd.read_csv(data_path)


# Usage example:
ticket_count = split_and_count(TrainingData, "Ticket", "/")
name_count = split_and_count(TrainingData, "Name", ",")


# SPLIT TRAIN/TEST --------------------------------

p = pathlib.Path("data/derived/")
p.mkdir(parents=True, exist_ok=True)

X_train, X_test, y_train, y_test = split_train_test(
    TrainingData, test_size=0.1,
    train_path=data_train_path,
    test_path=data_test_path
)


# PIPELINE ----------------------------


# Create the pipeline
pipe = create_pipeline(
    n_trees, max_depth=MAX_DEPTH, max_features=MAX_FEATURES
)


# ESTIMATION AND EVALUATION ----------------------

pipe.fit(X_train, y_train)


# Evaluate the model
score, matrix = evaluate_model(pipe, X_test, y_test)
print(f"{score:.1%} de bonnes réponses sur les données de test pour validation")
print(20 * "-")
print("matrice de confusion")
print(matrix)
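`split_and_count` is imported from `src/data/import_data.py`, which does not appear in this diff. Purely as an illustration (the real implementation may differ), a function with that signature could count, for each row, how many fields a column contains once split on a separator:

```python
import pandas as pd


def split_and_count(df: pd.DataFrame, column: str, separator: str) -> pd.Series:
    """Hypothetical sketch: number of parts obtained when splitting `column` on `separator`."""
    return df[column].str.split(separator).str.len()
```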
File renamed without changes.
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
pandas
PyYAML
scikit-learn
18 changes: 18 additions & 0 deletions src/models/train_evaluate.py
@@ -0,0 +1,18 @@
from sklearn.metrics import confusion_matrix


def evaluate_model(pipe, X_test, y_test):
    """
    Evaluate the model by calculating the score and confusion matrix.
    Args:
        pipe (sklearn.pipeline.Pipeline): The trained pipeline object.
        X_test (pandas.DataFrame): The test data.
        y_test (pandas.Series): The true labels for the test data.
    Returns:
        tuple: A tuple containing the score and confusion matrix.
    """
    score = pipe.score(X_test, y_test)
    matrix = confusion_matrix(y_test, pipe.predict(X_test))
    return score, matrix
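A small, self-contained usage sketch of `evaluate_model`: the toy data and the `DummyClassifier` below are invented purely to illustrate the call, and the snippet assumes it is run from the repository root so that `src` is importable.

```python
import pandas as pd
from sklearn.dummy import DummyClassifier

from src.models.train_evaluate import evaluate_model

# Toy data, made up for illustration only
X = pd.DataFrame({"Age": [22, 38, 26, 35], "Fare": [7.25, 71.3, 7.9, 53.1]})
y = pd.Series([0, 1, 1, 1], name="Survived")

model = DummyClassifier(strategy="most_frequent").fit(X, y)
score, matrix = evaluate_model(model, X, y)
print(f"{score:.1%} accuracy")
print(matrix)
```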
106 changes: 106 additions & 0 deletions src/pipeline/build_pipeline.py
@@ -0,0 +1,106 @@
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import pandas as pd


def split_train_test(data, test_size, train_path="train.csv", test_path="test.csv"):
"""
Split the data into training and testing sets based on the specified test size.
Optionally, save the split datasets to CSV files.
Args:
data (pandas.DataFrame): The input data to split.
test_size (float): The proportion of the dataset to include in the test split.
train_path (str, optional): The file path to save the training dataset.
Defaults to "train.csv".
test_path (str, optional): The file path to save the testing dataset.
Defaults to "test.csv".
Returns:
tuple: A tuple containing the training and testing datasets.
"""
y = data["Survived"]
X = data.drop("Survived", axis="columns")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if train_path:
        # Save features and label side by side (column-wise concatenation)
        pd.concat([X_train, y_train], axis=1).to_csv(train_path)
    if test_path:
        pd.concat([X_test, y_test], axis=1).to_csv(test_path)

    return X_train, X_test, y_train, y_test


def create_pipeline(
    n_trees,
    numeric_features=["Age", "Fare"],
    categorical_features=["Embarked", "Sex"],
    max_depth=None,
    max_features="sqrt",
):
"""
Create a pipeline for preprocessing and model definition.
Args:
n_trees (int): The number of trees in the random forest.
numeric_features (list, optional): The numeric features to be included in the pipeline.
Defaults to ["Age", "Fare"].
categorical_features (list, optional): The categorical features to be included
in the pipeline.
Defaults to ["Embarked", "Sex"].
max_depth (int, optional): The maximum depth of the random forest. Defaults to None.
max_features (str, optional): The maximum number of features to consider
when looking for the best split.
Defaults to "sqrt".
Returns:
sklearn.pipeline.Pipeline: The pipeline object.
"""
    # Numeric variables
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # Categorical variables
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder()),
        ]
    )

    # Preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ("Preprocessing numerical", numeric_transformer, numeric_features),
            (
                "Preprocessing categorical",
                categorical_transformer,
                categorical_features,
            ),
        ]
    )

    # Pipeline
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees, max_depth=max_depth, max_features=max_features
                ),
            ),
        ]
    )

    return pipe
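A short end-to-end sketch of how these two helpers fit together, mirroring `main.py`: the miniature DataFrame is made up for illustration only, and passing `None` as the paths skips writing CSV files.

```python
import pandas as pd

from src.pipeline.build_pipeline import create_pipeline, split_train_test

# Made-up miniature dataset with the columns the pipeline expects
data = pd.DataFrame({
    "Survived": [0, 1, 1, 0, 1, 0, 1, 0],
    "Age":      [22, 38, 26, 35, 28, 54, 2, 27],
    "Fare":     [7.2, 71.3, 7.9, 53.1, 8.0, 51.9, 21.1, 11.1],
    "Embarked": ["S", "S", "S", "S", "S", "C", "C", "C"],
    "Sex":      ["male", "female", "female", "male", "male", "male", "female", "female"],
})

X_train, X_test, y_train, y_test = split_train_test(
    data, test_size=0.25, train_path=None, test_path=None
)
pipe = create_pipeline(n_trees=10)
pipe.fit(X_train, y_train)
print(f"Accuracy on the (tiny) test split: {pipe.score(X_test, y_test):.1%}")
```

Keeping imputation, scaling and encoding inside the pipeline ensures the exact same preprocessing is applied at training time and at prediction time.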