Skip to content

Commit

Permalink
Merge pull request #155 from MAIF/update_data
Browse files: browse the repository at this point in the history
Update data module
  • Loading branch information
HugoPerrier authored Dec 19, 2023
2 parents 3ecb947 + ae1afa3 commit 262f563
Show file tree
Hide file tree
Showing 7 changed files with 16 additions and 12,364 deletions.
8 changes: 3 additions & 5 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
include AUTHORS.rst
include CONTRIBUTING.rst
include HISTORY.rst
include LICENSE
include README.rst
include Makefile

recursive-include tests *
recursive-exclude * __pycache__
recursive-exclude * *.py[co]

recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
recursive-include docs *.rst *.jpg *.png *.gif
recursive-include melusine *.yaml *.json
30 changes: 6 additions & 24 deletions melusine/data/_data_loader.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,10 @@
import os.path as op

import pandas as pd
from pathlib import Path


def load_email_data(type: str = "raw") -> pd.DataFrame:
def load_email_data() -> pd.DataFrame:
"""
Function to load a file containing toy email data.
Possible types are:
- raw : minimal DataFrame with email data
- preprocessed : DataFrame with preprocessed email data
- full : Full DataFrame with all email features
Return
------
@@ -18,23 +13,10 @@ def load_email_data(type: str = "raw") -> pd.DataFrame:
"""

# Path to data directory
data_directory = op.dirname(op.abspath(__file__))

# Load raw data
if type == "raw":
email_data_path = op.join(data_directory, "emails.json")
df = pd.read_json(email_data_path, orient="records").fillna("")

# Load preprocessed data
elif type == "preprocessed":
email_data_path = op.join(data_directory, "emails_preprocessed.json")
df = pd.read_json(email_data_path, orient="records").fillna("")
data_directory = Path(__file__).parent.resolve()

# Load preprocessed data with feature engineering
elif type == "full":
email_data_path = op.join(data_directory, "emails_full.json")
df = pd.read_json(email_data_path, orient="records").fillna("")
else:
raise ValueError(f"Unknown data type {type}. Choose between 'raw', 'preprocessed' and 'full'")
# Load data
email_data_path = data_directory / "emails.json"
df = pd.read_json(email_data_path, orient="records").fillna("")

return df
Loading

0 comments on commit 262f563

Please sign in to comment.