Replaced type string with camel to snake-case function (#996)

* Replaced type_string with a function that does a camel-case to snake-case conversion. Updated the schema version to reflect this, as the LatLong and ZIPCode variable types were affected.
* Added PR number to changelog.
* Made descriptor easier to understand.
* Fixed changelog formatting and added docstring to ClassNameDescriptor.
* Changed ClassNameDescriptor docstring to be more general.
* Fixed typo.
alteryx · May 28, 2020 · ec7bcff · ec7bcff
1 parent 3170cac
commit ec7bcff
Show file tree

Hide file tree

Showing 8 changed files with 25 additions and 35 deletions.
diff --git a/docs/source/changelog.rst b/docs/source/changelog.rst
@@ -12,7 +12,8 @@ Changelog
           for the ``entity_from_dataframe`` function (:pr:`988`)
     * Fixes
         * Fix errors with Equals and NotEquals primitives when comparing categoricals or different dtypes (:pr:`968`)
-        * Normalized type_strings of ``Variable`` classes so that the ``find_variable_types`` function produces a dictionary with a clear key to name transition (:pr:`982`)
+        * Normalized type_strings of ``Variable`` classes so that the ``find_variable_types`` function produces a 
+          dictionary with a clear key to name transition (:pr:`982`, :pr:`996`)
         * Remove pandas.datetime in test_calculate_feature_matrix due to deprecation (:pr:`998`)
     * Changes
     * Documentation Changes
@@ -35,7 +36,7 @@ Changelog
   dataframe can be in any order as long as they are named properly.
 
 * The ``type_string`` attributes of all ``Variable`` subclasses are now a snake case conversion of their class names. This
-  changes the ``type_string`` of the ``Unknown``, ``IPAddress``, ``EmailAddress``, ``SubRegionCode``, and ``FilePath`` classes.
+  changes the ``type_string`` of the ``Unknown``, ``IPAddress``, ``EmailAddress``, ``SubRegionCode``, ``FilePath``, ``LatLong``, and ``ZIPCode`` classes.
   Old saved entitysets that used these variables may load incorrectly.
 
 **v0.14.0 Apr 30, 2020**

diff --git a/featuretools/entityset/serialize.py b/featuretools/entityset/serialize.py
@@ -8,7 +8,7 @@
 from featuretools.utils.wrangle import _is_s3, _is_url
 
 FORMATS = ['csv', 'pickle', 'parquet']
-SCHEMA_VERSION = "3.0.0"
+SCHEMA_VERSION = "4.0.0"
 
 
 def entity_to_description(entity):

diff --git a/featuretools/feature_base/features_serializer.py b/featuretools/feature_base/features_serializer.py
@@ -7,7 +7,7 @@
 from featuretools.utils.wrangle import _is_s3, _is_url
 from featuretools.version import __version__ as ft_version
 
-SCHEMA_VERSION = "4.0.0"
+SCHEMA_VERSION = "5.0.0"
 
 
 def save_features(features, location=None, profile_name=None):

diff --git a/featuretools/tests/entityset_tests/test_serialization.py b/featuretools/tests/entityset_tests/test_serialization.py
@@ -17,7 +17,7 @@
 BUCKET_NAME = "test-bucket"
 WRITE_KEY_NAME = "test-key"
 TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
-TEST_FILE = "test_serialization_data_entityset_schema_3.0.0.tar"
+TEST_FILE = "test_serialization_data_entityset_schema_4.0.0.tar"
 S3_URL = "s3://featuretools-static/" + TEST_FILE
 URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
 TEST_KEY = "test_access_key_es"

diff --git a/featuretools/tests/primitive_tests/test_feature_serialization.py b/featuretools/tests/primitive_tests/test_feature_serialization.py
@@ -36,7 +36,7 @@
 BUCKET_NAME = "test-bucket"
 WRITE_KEY_NAME = "test-key"
 TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
-TEST_FILE = "test_feature_serialization_feature_schema_4.0.0_entityset_schema_3.0.0.json"
+TEST_FILE = "test_feature_serialization_feature_schema_5.0.0_entityset_schema_4.0.0.json"
 S3_URL = "s3://featuretools-static/" + TEST_FILE
 URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
 TEST_CONFIG = "CheckConfigPassesOn"

diff --git a/featuretools/tests/primitive_tests/test_features_serializer.py b/featuretools/tests/primitive_tests/test_features_serializer.py
@@ -4,7 +4,7 @@
 from featuretools.entityset.deserialize import description_to_entityset
 from featuretools.feature_base.features_serializer import FeaturesSerializer
 
-SCHEMA_VERSION = "4.0.0"
+SCHEMA_VERSION = "5.0.0"
 
 
 def test_single_feature(es):

diff --git a/featuretools/utils/gen_utils.py b/featuretools/utils/gen_utils.py
@@ -1,4 +1,5 @@
 import importlib
+import re
 import sys
 import warnings
 from itertools import zip_longest
@@ -85,3 +86,8 @@ def import_or_raise(library, error_msg):
         return importlib.import_module(library)
     except ImportError:
         raise ImportError(error_msg)
+
+
+def camel_to_snake(s):
+    s = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', s)
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s).lower()
diff --git a/featuretools/variable_types/variable.py b/featuretools/variable_types/variable.py
@@ -1,7 +1,14 @@
 import numpy as np
 import pandas as pd
 
-from featuretools.utils.gen_utils import find_descendents
+from featuretools.utils.gen_utils import camel_to_snake, find_descendents
+
+
+class ClassNameDescriptor(object):
+    """Descriptor to convert a class's name from camelcase to snakecase
+    """
+    def __get__(self, instance, class_):
+        return camel_to_snake(class_.__name__)
 
 
 class Variable(object):
@@ -18,7 +25,7 @@ class Variable(object):
     See Also:
         :class:`.Entity`, :class:`.Relationship`, :class:`.BaseEntitySet`
     """
-    type_string = None
+    type_string = ClassNameDescriptor()
     _default_pandas_dtype = object
 
     def __init__(self, id, entity, name=None):
@@ -112,12 +119,11 @@ def to_data_description(self):
 
 
 class Unknown(Variable):
-    type_string = "unknown"
+    pass
 
 
 class Discrete(Variable):
     """Superclass representing variables that take on discrete values"""
-    type_string = "discrete"
 
     def __init__(self, id, entity, name=None):
         super(Discrete, self).__init__(id, entity, name)
@@ -142,7 +148,6 @@ class Boolean(Variable):
         true_values (list) : List of valued true values. Defaults to [1, True, "true", "True", "yes", "t", "T"]
         false_values (list): List of valued false values. Defaults to [0, False, "false", "False", "no", "f", "F"]
     """
-    type_string = "boolean"
     _default_pandas_dtype = bool
 
     def __init__(self,
@@ -172,7 +177,6 @@ class Categorical(Discrete):
     Args:
         categories (list) : List of categories. If left blank, inferred from data.
     """
-    type_string = "categorical"
 
     def __init__(self, id, entity, name=None, categories=None):
         self.categories = None or []
@@ -186,13 +190,11 @@ def to_data_description(self):
 
 class Id(Categorical):
     """Represents variables that identify another entity"""
-    type_string = "id"
     _default_pandas_dtype = int
 
 
 class Ordinal(Discrete):
     """Represents variables that take on an ordered discrete value"""
-    type_string = "ordinal"
     _default_pandas_dtype = int
 
 
@@ -210,7 +212,6 @@ class Numeric(Variable):
         std (float)
         mean (float)
     """
-    type_string = "numeric"
     _default_pandas_dtype = float
 
     def __init__(self,
@@ -241,7 +242,6 @@ class Index(Variable):
     Attributes:
         count (int)
     """
-    type_string = "index"
     _default_pandas_dtype = int
 
 
@@ -251,7 +251,6 @@ class Datetime(Variable):
     Args:
         format (str): Python datetime format string documented `here <http://strftime.org/>`_.
     """
-    type_string = "datetime"
     _default_pandas_dtype = np.datetime64
 
     def __init__(self, id, entity, name=None, format=None):
@@ -269,19 +268,16 @@ def to_data_description(self):
 
 class TimeIndex(Variable):
     """Represents time index of entity"""
-    type_string = "time_index"
     _default_pandas_dtype = np.datetime64
 
 
 class NumericTimeIndex(TimeIndex, Numeric):
     """Represents time index of entity that is numeric"""
-    type_string = "numeric_time_index"
     _default_pandas_dtype = float
 
 
 class DatetimeTimeIndex(TimeIndex, Datetime):
     """Represents time index of entity that is a datetime"""
-    type_string = "datetime_time_index"
     _default_pandas_dtype = np.datetime64
 
 
@@ -293,7 +289,6 @@ class Timedelta(Variable):
         start_inclusive (bool, optional) : Whether or not range includes the start value.
         end_inclusive (bool, optional) : Whether or not range includes the end value
     """
-    type_string = "timedelta"
     _default_pandas_dtype = np.timedelta64
 
     def __init__(self,
@@ -320,7 +315,6 @@ def to_data_description(self):
 
 class Text(Variable):
     """Represents variables that are arbitary strings"""
-    type_string = "text"
     _default_pandas_dtype = str
 
 
@@ -338,45 +332,39 @@ class LatLong(Variable):
     To make a latlong in a dataframe do
     data['latlong'] = data[['latitude', 'longitude']].apply(tuple, axis=1)
     """
-    type_string = "latlong"
 
 
 class ZIPCode(Categorical):
     """Represents a postal address in the United States.
     Consists of a series of digits which are cast as a
     string. Five-digit and nine-digit ZIP codes are supported.
     """
-    type_string = "zipcode"
     _default_pandas_dtype = str
 
 
 class IPAddress(Variable):
     """Represents a computer network address. Represented
     in dotted-decimal notation. IPv4 and IPv6 are supported.
     """
-    type_string = "ip_address"
     _default_pandas_dtype = str
 
 
 class FullName(Variable):
     """Represents a person's full name. May consist of a
     first name, last name, and a title.
     """
-    type_string = "full_name"
     _default_pandas_dtype = str
 
 
 class EmailAddress(Variable):
     """Represents an email box to which email message are sent.
     Consists of a local-part, an @ symbol, and a domain.
     """
-    type_string = "email_address"
     _default_pandas_dtype = str
 
 
 class URL(Variable):
     """Represents a valid web url (with or without http/www)"""
-    type_string = "url"
     _default_pandas_dtype = str
 
 
@@ -385,13 +373,11 @@ class PhoneNumber(Variable):
     Can be with/without parenthesis.
     Can be with/without area/country codes.
     """
-    type_string = "phone_number"
     _default_pandas_dtype = str
 
 
 class DateOfBirth(Datetime):
     """Represents a date of birth as a datetime"""
-    type_string = "date_of_birth"
     _default_pandas_dtype = np.datetime64
 
 
@@ -401,7 +387,6 @@ class CountryCode(Categorical):
     should be in the Alpha-2 format.
     e.g. United States of America = US
     """
-    type_string = "country_code"
     _default_pandas_dtype = str
 
 
@@ -411,19 +396,17 @@ class SubRegionCode(Categorical):
     should be in the Alpha-2 format.
     e.g. United States of America, Arizona = US-AZ
     """
-    type_string = "sub_region_code"
     _default_pandas_dtype = str
 
 
 class FilePath(Variable):
     """Represents a valid filepath, absolute or relative"""
-    type_string = "file_path"
     _default_pandas_dtype = str
 
 
 def find_variable_types():
     return {vtype.type_string: vtype for vtype in find_descendents(Variable)
-            if issubclass(vtype, Variable) and vtype.type_string}
+            if vtype != Variable}
 
 
 DEFAULT_DTYPE_VALUES = {