Skip to content

Commit

Permalink
Replaced type string with camel to snake-case function (#996)
Browse files Browse the repository at this point in the history
* replaced type_string with a function that does a camel to snakecase conversion. Updated the schema version to reflect this as the LatLong and ZIPcode variable types were affected

* added pr number to changelog

* made descriptor easier to understand

* fixed changelog formatting and added docstring to ClassNameDescriptor

* changed classname descriptor docstring to be more general

* fixed typo
  • Loading branch information
tuethan1999 authored May 28, 2020
1 parent 3170cac commit ec7bcff
Show file tree
Hide file tree
Showing 8 changed files with 25 additions and 35 deletions.
5 changes: 3 additions & 2 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@ Changelog
for the ``entity_from_dataframe`` function (:pr:`988`)
* Fixes
* Fix errors with Equals and NotEquals primitives when comparing categoricals or different dtypes (:pr:`968`)
* Normalized type_strings of ``Variable`` classes so that the ``find_variable_types`` function produces a dictionary with a clear key to name transition (:pr:`982`)
* Normalized type_strings of ``Variable`` classes so that the ``find_variable_types`` function produces a
dictionary with a clear key to name transition (:pr:`982`, :pr:`996`)
* Remove pandas.datetime in test_calculate_feature_matrix due to deprecation (:pr:`998`)
* Changes
* Documentation Changes
Expand All @@ -35,7 +36,7 @@ Changelog
dataframe can be in any order as long as they are named properly.

* The ``type_string`` attributes of all ``Variable`` subclasses are now a snake case conversion of their class names. This
changes the ``type_string`` of the ``Unknown``, ``IPAddress``, ``EmailAddress``, ``SubRegionCode``, and ``FilePath`` classes.
changes the ``type_string`` of the ``Unknown``, ``IPAddress``, ``EmailAddress``, ``SubRegionCode``, ``FilePath``, ``LatLong``, and ``ZIPCode`` classes.
Old saved entitysets that used these variables may load incorrectly.

**v0.14.0 Apr 30, 2020**
Expand Down
2 changes: 1 addition & 1 deletion featuretools/entityset/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from featuretools.utils.wrangle import _is_s3, _is_url

FORMATS = ['csv', 'pickle', 'parquet']
SCHEMA_VERSION = "3.0.0"
SCHEMA_VERSION = "4.0.0"


def entity_to_description(entity):
Expand Down
2 changes: 1 addition & 1 deletion featuretools/feature_base/features_serializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from featuretools.utils.wrangle import _is_s3, _is_url
from featuretools.version import __version__ as ft_version

SCHEMA_VERSION = "4.0.0"
SCHEMA_VERSION = "5.0.0"


def save_features(features, location=None, profile_name=None):
Expand Down
2 changes: 1 addition & 1 deletion featuretools/tests/entityset_tests/test_serialization.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
BUCKET_NAME = "test-bucket"
WRITE_KEY_NAME = "test-key"
TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
TEST_FILE = "test_serialization_data_entityset_schema_3.0.0.tar"
TEST_FILE = "test_serialization_data_entityset_schema_4.0.0.tar"
S3_URL = "s3://featuretools-static/" + TEST_FILE
URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
TEST_KEY = "test_access_key_es"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
BUCKET_NAME = "test-bucket"
WRITE_KEY_NAME = "test-key"
TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
TEST_FILE = "test_feature_serialization_feature_schema_4.0.0_entityset_schema_3.0.0.json"
TEST_FILE = "test_feature_serialization_feature_schema_5.0.0_entityset_schema_4.0.0.json"
S3_URL = "s3://featuretools-static/" + TEST_FILE
URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
TEST_CONFIG = "CheckConfigPassesOn"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from featuretools.entityset.deserialize import description_to_entityset
from featuretools.feature_base.features_serializer import FeaturesSerializer

SCHEMA_VERSION = "4.0.0"
SCHEMA_VERSION = "5.0.0"


def test_single_feature(es):
Expand Down
6 changes: 6 additions & 0 deletions featuretools/utils/gen_utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import importlib
import re
import sys
import warnings
from itertools import zip_longest
Expand Down Expand Up @@ -85,3 +86,8 @@ def import_or_raise(library, error_msg):
return importlib.import_module(library)
except ImportError:
raise ImportError(error_msg)


def camel_to_snake(s):
    """Convert a CamelCase string to snake_case.

    Two regex passes insert underscores at word boundaries, then the
    result is lowercased. Runs of capitals stay together, so
    ``"IPAddress"`` becomes ``"ip_address"`` and ``"URL"`` becomes ``"url"``.

    Args:
        s (str): The CamelCase string to convert.

    Returns:
        str: The snake_case form of ``s``.
    """
    # Pass 1: separate any character from a following capital that starts
    # a capital-plus-lowercase run (e.g. "ZIPCode" -> "ZIP_Code").
    word_split = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s)
    # Pass 2: separate a lowercase letter or digit from the capital right
    # after it (e.g. "Time_IndexA" style leftovers from pass 1).
    boundary_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', word_split)
    return boundary_split.lower()
39 changes: 11 additions & 28 deletions featuretools/variable_types/variable.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
import numpy as np
import pandas as pd

from featuretools.utils.gen_utils import find_descendents
from featuretools.utils.gen_utils import camel_to_snake, find_descendents


class ClassNameDescriptor(object):
    """Descriptor that yields the snake-case form of a class's name.

    Reading the attribute returns ``camel_to_snake`` applied to the
    ``__name__`` of the class the attribute was accessed through.
    """

    def __get__(self, instance, class_):
        # Only the owning class is consulted; ``instance`` is not used.
        owner_name = class_.__name__
        return camel_to_snake(owner_name)


class Variable(object):
Expand All @@ -18,7 +25,7 @@ class Variable(object):
See Also:
:class:`.Entity`, :class:`.Relationship`, :class:`.BaseEntitySet`
"""
type_string = None
type_string = ClassNameDescriptor()
_default_pandas_dtype = object

def __init__(self, id, entity, name=None):
Expand Down Expand Up @@ -112,12 +119,11 @@ def to_data_description(self):


class Unknown(Variable):
type_string = "unknown"
pass


class Discrete(Variable):
"""Superclass representing variables that take on discrete values"""
type_string = "discrete"

def __init__(self, id, entity, name=None):
super(Discrete, self).__init__(id, entity, name)
Expand All @@ -142,7 +148,6 @@ class Boolean(Variable):
true_values (list) : List of valid true values. Defaults to [1, True, "true", "True", "yes", "t", "T"]
false_values (list): List of valid false values. Defaults to [0, False, "false", "False", "no", "f", "F"]
"""
type_string = "boolean"
_default_pandas_dtype = bool

def __init__(self,
Expand Down Expand Up @@ -172,7 +177,6 @@ class Categorical(Discrete):
Args:
categories (list) : List of categories. If left blank, inferred from data.
"""
type_string = "categorical"

def __init__(self, id, entity, name=None, categories=None):
self.categories = None or []
Expand All @@ -186,13 +190,11 @@ def to_data_description(self):

class Id(Categorical):
"""Represents variables that identify another entity"""
type_string = "id"
_default_pandas_dtype = int


class Ordinal(Discrete):
"""Represents variables that take on an ordered discrete value"""
type_string = "ordinal"
_default_pandas_dtype = int


Expand All @@ -210,7 +212,6 @@ class Numeric(Variable):
std (float)
mean (float)
"""
type_string = "numeric"
_default_pandas_dtype = float

def __init__(self,
Expand Down Expand Up @@ -241,7 +242,6 @@ class Index(Variable):
Attributes:
count (int)
"""
type_string = "index"
_default_pandas_dtype = int


Expand All @@ -251,7 +251,6 @@ class Datetime(Variable):
Args:
format (str): Python datetime format string documented `here <http://strftime.org/>`_.
"""
type_string = "datetime"
_default_pandas_dtype = np.datetime64

def __init__(self, id, entity, name=None, format=None):
Expand All @@ -269,19 +268,16 @@ def to_data_description(self):

class TimeIndex(Variable):
"""Represents time index of entity"""
type_string = "time_index"
_default_pandas_dtype = np.datetime64


class NumericTimeIndex(TimeIndex, Numeric):
"""Represents time index of entity that is numeric"""
type_string = "numeric_time_index"
_default_pandas_dtype = float


class DatetimeTimeIndex(TimeIndex, Datetime):
"""Represents time index of entity that is a datetime"""
type_string = "datetime_time_index"
_default_pandas_dtype = np.datetime64


Expand All @@ -293,7 +289,6 @@ class Timedelta(Variable):
start_inclusive (bool, optional) : Whether or not range includes the start value.
end_inclusive (bool, optional) : Whether or not range includes the end value
"""
type_string = "timedelta"
_default_pandas_dtype = np.timedelta64

def __init__(self,
Expand All @@ -320,7 +315,6 @@ def to_data_description(self):

class Text(Variable):
"""Represents variables that are arbitrary strings"""
type_string = "text"
_default_pandas_dtype = str


Expand All @@ -338,45 +332,39 @@ class LatLong(Variable):
To make a latlong in a dataframe do
data['latlong'] = data[['latitude', 'longitude']].apply(tuple, axis=1)
"""
type_string = "latlong"


class ZIPCode(Categorical):
"""Represents a postal address in the United States.
Consists of a series of digits which are cast as a
string. Five digit and 9 digit zipcodes are supported.
"""
type_string = "zipcode"
_default_pandas_dtype = str


class IPAddress(Variable):
"""Represents a computer network address. Represented
in dotted-decimal notation. IPv4 and IPv6 are supported.
"""
type_string = "ip_address"
_default_pandas_dtype = str


class FullName(Variable):
"""Represents a person's full name. May consist of a
first name, last name, and a title.
"""
type_string = "full_name"
_default_pandas_dtype = str


class EmailAddress(Variable):
"""Represents an email box to which email messages are sent.
Consists of a local-part, an @ symbol, and a domain.
"""
type_string = "email_address"
_default_pandas_dtype = str


class URL(Variable):
"""Represents a valid web url (with or without http/www)"""
type_string = "url"
_default_pandas_dtype = str


Expand All @@ -385,13 +373,11 @@ class PhoneNumber(Variable):
Can be with/without parentheses.
Can be with/without area/country codes.
"""
type_string = "phone_number"
_default_pandas_dtype = str


class DateOfBirth(Datetime):
"""Represents a date of birth as a datetime"""
type_string = "date_of_birth"
_default_pandas_dtype = np.datetime64


Expand All @@ -401,7 +387,6 @@ class CountryCode(Categorical):
should be in the Alpha-2 format.
e.g. United States of America = US
"""
type_string = "country_code"
_default_pandas_dtype = str


Expand All @@ -411,19 +396,17 @@ class SubRegionCode(Categorical):
should be in the Alpha-2 format.
e.g. United States of America, Arizona = US-AZ
"""
type_string = "sub_region_code"
_default_pandas_dtype = str


class FilePath(Variable):
"""Represents a valid filepath, absolute or relative"""
type_string = "file_path"
_default_pandas_dtype = str


def find_variable_types():
return {vtype.type_string: vtype for vtype in find_descendents(Variable)
if issubclass(vtype, Variable) and vtype.type_string}
if vtype != Variable}


DEFAULT_DTYPE_VALUES = {
Expand Down

0 comments on commit ec7bcff

Please sign in to comment.