Commit 8a64f7e (parent: b11c70e), showing 15 changed files with 290 additions and 2,789 deletions.
@@ -0,0 +1,26 @@
name: Construction image Docker

on: [push]

jobs:
  docker:
    runs-on: ubuntu-latest
    steps:
      -
        name: Set up QEMU
        uses: docker/setup-qemu-action@v3
      -
        name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      -
        name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      -
        name: Build and push
        uses: docker/build-push-action@v5
        with:
          push: true
          tags: linogaliana/application-correction:latest
@@ -0,0 +1,26 @@
name: Python package

on: [push]

jobs:
  build:

    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # latest python minor
          python-version: '3.x'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install pylint
      - name: Lint
        run: |
          pylint src --fail-under=6
      - name: Test Python code
        run: python main.py
@@ -0,0 +1,4 @@
config.yaml
__pycache__/
data/
titanic/
@@ -0,0 +1,11 @@
FROM ubuntu:22.04
WORKDIR ${HOME}/titanic
# Install Python
RUN apt-get -y update && \
    apt-get install -y python3-pip
# Install project dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY main.py .
COPY src ./src
CMD ["python3", "main.py"]
@@ -0,0 +1,16 @@
# Probabilité de survie sur le Titanic

Pour pouvoir utiliser ce projet, il
est recommandé de créer un fichier `config.yaml`
ayant la structure suivante:

```yaml
jeton_api: ####
data_path: https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv
```

Pour installer les dépendances

```bash
pip install -r requirements.txt
```
@@ -0,0 +1,9 @@
#!/bin/bash
# Install Python
apt-get -y update
apt-get install -y python3-pip python3-venv
# Create empty virtual environment
python3 -m venv titanic
source titanic/bin/activate
# Install project dependencies
pip install -r requirements.txt
@@ -0,0 +1,71 @@
"""
Prediction de la survie d'un individu sur le Titanic
"""

import argparse
import pathlib
import pandas as pd

from src.data.import_data import import_yaml_config, split_and_count
from src.pipeline.build_pipeline import split_train_test, create_pipeline
from src.models.train_evaluate import evaluate_model

parser = argparse.ArgumentParser(description="Paramètres du random forest")
parser.add_argument("--n_trees", type=int, default=20, help="Nombre d'arbres")
args = parser.parse_args()

n_trees = args.n_trees

URL_RAW = "https://minio.lab.sspcloud.fr/lgaliana/ensae-reproductibilite/data/raw/data.csv"
config = import_yaml_config("configuration/config.yaml")
jeton_api = config.get("jeton_api")
data_path = config.get("data_path", URL_RAW)
data_train_path = config.get("train_path", "data/derived/train.csv")
data_test_path = config.get("test_path", "data/derived/test.csv")

MAX_DEPTH = None
MAX_FEATURES = "sqrt"


# IMPORT ET EXPLORATION DONNEES --------------------------------

TrainingData = pd.read_csv(data_path)


# Usage example:
ticket_count = split_and_count(TrainingData, "Ticket", "/")
name_count = split_and_count(TrainingData, "Name", ",")


# SPLIT TRAIN/TEST --------------------------------

p = pathlib.Path("data/derived/")
p.mkdir(parents=True, exist_ok=True)

X_train, X_test, y_train, y_test = split_train_test(
    TrainingData, test_size=0.1,
    train_path=data_train_path,
    test_path=data_test_path
)


# PIPELINE ----------------------------


# Create the pipeline
pipe = create_pipeline(
    n_trees, max_depth=MAX_DEPTH, max_features=MAX_FEATURES
)


# ESTIMATION ET EVALUATION ----------------------

pipe.fit(X_train, y_train)


# Evaluate the model
score, matrix = evaluate_model(pipe, X_test, y_test)
print(f"{score:.1%} de bonnes réponses sur les données de test pour validation")
print(20 * "-")
print("matrice de confusion")
print(matrix)
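The two helpers imported from src.data.import_data (import_yaml_config and split_and_count) are not part of this diff excerpt, so the sketch below is only an assumption about their behaviour, inferred from how main.py calls them: import_yaml_config is assumed to return a (possibly empty) dict, and split_and_count is assumed to count the pieces obtained by splitting a string column.

```python
# Hypothetical sketch of the helpers used by main.py; the real
# implementations live in src/data/import_data.py and are not shown
# in this excerpt.
import pathlib

import pandas as pd
import yaml


def import_yaml_config(path: str) -> dict:
    """Load a YAML config file; assumed to return an empty dict when the file is missing."""
    config = {}
    config_file = pathlib.Path(path)
    if config_file.exists():
        with open(config_file, "r", encoding="utf-8") as stream:
            config = yaml.safe_load(stream) or {}
    return config


def split_and_count(df: pd.DataFrame, column: str, separator: str) -> pd.Series:
    """Assumed behaviour: number of parts per row when `column` is split on `separator`."""
    return df[column].str.split(separator).str.len()
```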
File renamed without changes.
@@ -0,0 +1,3 @@
pandas
PyYAML
scikit-learn
@@ -0,0 +1,18 @@
from sklearn.metrics import confusion_matrix


def evaluate_model(pipe, X_test, y_test):
    """
    Evaluate the model by calculating the score and confusion matrix.

    Args:
        pipe (sklearn.pipeline.Pipeline): The trained pipeline object.
        X_test (pandas.DataFrame): The test data.
        y_test (pandas.Series): The true labels for the test data.

    Returns:
        tuple: A tuple containing the score and confusion matrix.
    """
    score = pipe.score(X_test, y_test)
    matrix = confusion_matrix(y_test, pipe.predict(X_test))
    return score, matrix
@@ -0,0 +1,106 @@
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

import pandas as pd


def split_train_test(data, test_size, train_path="train.csv", test_path="test.csv"):
    """
    Split the data into training and testing sets based on the specified test size.
    Optionally, save the split datasets to CSV files.

    Args:
        data (pandas.DataFrame): The input data to split.
        test_size (float): The proportion of the dataset to include in the test split.
        train_path (str, optional): The file path to save the training dataset.
            Defaults to "train.csv".
        test_path (str, optional): The file path to save the testing dataset.
            Defaults to "test.csv".

    Returns:
        tuple: A tuple containing the training and testing datasets.
    """
    y = data["Survived"]
    X = data.drop("Survived", axis="columns")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

    if train_path:
        pd.concat([X_train, y_train], axis="columns").to_csv(train_path)
    if test_path:
        pd.concat([X_test, y_test], axis="columns").to_csv(test_path)

    return X_train, X_test, y_train, y_test


def create_pipeline(
    n_trees,
    numeric_features=["Age", "Fare"],
    categorical_features=["Embarked", "Sex"],
    max_depth=None,
    max_features="sqrt",
):
    """
    Create a pipeline for preprocessing and model definition.

    Args:
        n_trees (int): The number of trees in the random forest.
        numeric_features (list, optional): The numeric features to be included in the pipeline.
            Defaults to ["Age", "Fare"].
        categorical_features (list, optional): The categorical features to be included
            in the pipeline. Defaults to ["Embarked", "Sex"].
        max_depth (int, optional): The maximum depth of the random forest. Defaults to None.
        max_features (str, optional): The maximum number of features to consider
            when looking for the best split. Defaults to "sqrt".

    Returns:
        sklearn.pipeline.Pipeline: The pipeline object.
    """
    # Variables numériques
    numeric_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", MinMaxScaler()),
        ]
    )

    # Variables catégorielles
    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("onehot", OneHotEncoder()),
        ]
    )

    # Preprocessing
    preprocessor = ColumnTransformer(
        transformers=[
            ("Preprocessing numerical", numeric_transformer, numeric_features),
            (
                "Preprocessing categorical",
                categorical_transformer,
                categorical_features,
            ),
        ]
    )

    # Pipeline
    pipe = Pipeline(
        [
            ("preprocessor", preprocessor),
            (
                "classifier",
                RandomForestClassifier(
                    n_estimators=n_trees, max_depth=max_depth, max_features=max_features
                ),
            ),
        ]
    )

    return pipe
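For orientation, here is a small end-to-end usage sketch combining split_train_test, create_pipeline, and evaluate_model, mirroring what main.py does. The toy DataFrame is purely illustrative (its values are not taken from the project's data), and the src.* import paths assume the package layout used in main.py.

```python
import pandas as pd

from src.models.train_evaluate import evaluate_model
from src.pipeline.build_pipeline import create_pipeline, split_train_test

# Illustrative toy data with the columns expected by the default pipeline
toy = pd.DataFrame(
    {
        "Survived": [0, 1, 1, 0, 1, 0, 0, 1, 1, 0] * 3,
        "Age": [22, 38, 26, 35, 54, 2, 27, 14, 58, 20] * 3,
        "Fare": [7.3, 71.3, 7.9, 53.1, 51.9, 21.1, 11.1, 30.1, 26.6, 8.1] * 3,
        "Embarked": ["S", "C", "S", "S", "S", "Q", "S", "C", "S", "Q"] * 3,
        "Sex": ["male", "female", "female", "male", "male",
                "male", "female", "female", "male", "female"] * 3,
    }
)

# Split without writing CSVs (train_path/test_path are falsy, so nothing is saved)
X_train, X_test, y_train, y_test = split_train_test(
    toy, test_size=0.2, train_path=None, test_path=None
)

pipe = create_pipeline(n_trees=10)
pipe.fit(X_train, y_train)

score, matrix = evaluate_model(pipe, X_test, y_test)
print(f"Accuracy: {score:.1%}")
print(matrix)
```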