NOTICE

The upcoming release of Featuretools 1.0.0 contains several breaking changes. Users are encouraged to test this version prior to release:

pip install featuretools==1.0.0rc1

For details on migrating to the new version, refer to Transitioning to Featuretools Version 1.0. Please report any issues in the Featuretools GitHub repo or by messaging in the Alteryx Open Source Slack.


Source code for featuretools.variable_types.variable

import warnings

import numpy as np
import pandas as pd

from featuretools.utils.gen_utils import camel_to_snake


class ClassNameDescriptor(object):
    """Descriptor exposing the owning class's name in snake case.

    Reading the attribute (on the class or on an instance) passes the
    owner class's ``__name__`` through ``camel_to_snake`` and returns the
    result.
    """

    def __get__(self, instance, class_):
        # Only the owner class matters; the instance is ignored.
        snake_name = camel_to_snake(class_.__name__)
        return snake_name


class Variable(object):
    """Describe a single variable that belongs to an entity.

    A Variable is analogous to a column of a table in a relational database.

    Args:
        id (str) : Id of variable. Must match underlying data in the Entity
            it belongs to.
        entity (:class:`.Entity`) : Entity this variable belongs to.
        name (str, optional) : Variable name. Defaults to id.
        description (str, optional) : Description of what the variable is.
            Used when describing features with `ft.describe_feature`.

    See Also:
        :class:`.Entity`, :class:`.Relationship`, :class:`.BaseEntitySet`
    """
    # Snake-case version of the concrete class name, computed on access.
    type_string = ClassNameDescriptor()
    # dtype used for ``interesting_values`` when the entity's dataframe has
    # no column named after this variable's id.
    _default_pandas_dtype = object

    def __init__(self, id, entity, name=None, description=None):
        assert isinstance(id, str), "Variable id must be a string"
        self.id = id
        self._name = name
        self.entity_id = entity.id
        self._description = description
        assert entity.entityset is not None, "Entity must contain reference to EntitySet"
        self.entity = entity

        # Prefer the dtype of the real column; fall back to the class default.
        if self.id in self.entity.df:
            values_dtype = self.entity.df[self.id].dtype
        else:
            values_dtype = self._default_pandas_dtype

        # Swap the bare numpy time types for nanosecond-resolution dtype
        # strings before building the (initially empty) Series.
        if values_dtype == np.datetime64:
            values_dtype = 'datetime64[ns]'
        elif values_dtype == np.timedelta64:
            values_dtype = 'timedelta64[ns]'

        self._interesting_values = pd.Series(dtype=values_dtype)

    @property
    def entityset(self):
        """EntitySet referenced by this variable's entity."""
        return self.entity.entityset

    def __eq__(self, other, deep=False):
        """Compare two variables.

        Args:
            other : Object to compare against.
            deep (bool) : If True, additionally require both variables to
                have the same set of interesting values.

        Returns:
            bool : True when ``other`` is an instance of this variable's
                class with the same id and entity id (and, for a deep
                comparison, the same interesting values).
        """
        same_identity = (
            isinstance(other, self.__class__)
            and self.id == other.id
            and self.entity_id == other.entity_id
        )
        if deep:
            return same_identity and \
                set(self.interesting_values.values) == set(other.interesting_values.values)
        return same_identity

    def __hash__(self):
        # Hash on the same (id, entity_id) pair the shallow equality uses.
        return hash((self.id, self.entity_id))

    def __repr__(self):
        return u"<Variable: {} (dtype = {})>".format(self.name, self.type_string)

    @classmethod
    def create_from(cls, variable):
        """Build a variable of this type from an existing variable.

        Only the id, name and entity of the existing variable are carried
        over.

        Args:
            variable (Variable) : Existing variable to create from.

        Returns:
            :class:`.Variable` : new variable

        """
        return cls(id=variable.id, name=variable.name, entity=variable.entity)

    @property
    def name(self):
        """Variable name; falls back to the id when no name was given."""
        if self._name is None:
            return self.id
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    @property
    def dtype(self):
        """Type string of the variable, or "generic_type" if it is None."""
        type_name = self.type_string
        if type_name is None:
            return "generic_type"
        return type_name

    @property
    def description(self):
        """Explicit description, or ``the "<name>"`` when none was set."""
        if self._description is None:
            return 'the "{}"'.format(self.name)
        return self._description

    @description.setter
    def description(self, description):
        # A changed description asks the EntitySet to reset its data
        # description before the new value is stored.
        if description != self._description:
            self.entity.entityset.reset_data_description()
        self._description = description

    @property
    def interesting_values(self):
        """pandas Series holding this variable's interesting values."""
        return self._interesting_values

    @interesting_values.setter
    def interesting_values(self, interesting_values):
        # Keep the dtype chosen in __init__ when replacing the values.
        current_dtype = self._interesting_values.dtype
        self._interesting_values = pd.Series(interesting_values,
                                             dtype=current_dtype)

    @property
    def series(self):
        """Column of the entity's dataframe named after this variable's id."""
        return self.entity.df[self.id]

    def to_data_description(self):
        """Return a dictionary describing this variable.

        Returns:
            dict : ``id``, ``type`` (holding the type string under
                ``value``) and ``properties`` (name, description, entity id
                and the interesting values encoded with ``Series.to_json``).
        """
        type_info = {
            'value': self.type_string,
        }
        properties = {
            'name': self.name,
            'description': self.description,
            'entity': self.entity.id,
            'interesting_values': self._interesting_values.to_json()
        }
        return {
            'id': self.id,
            'type': type_info,
            'properties': properties,
        }


class Unknown(Variable):
    """Variable subclass that adds no attributes or behavior of its own."""


class Discrete(Variable):
    """Superclass representing variables that take on discrete values

    Args:
        id (str) : Id of variable.
        entity (:class:`.Entity`) : Entity this variable belongs to.
        name (str, optional) : Variable name. Defaults to id.
        description (str, optional) : Description of what the variable is.
            Forwarded unchanged to :class:`.Variable`.
    """

    def __init__(self, id, entity, name=None, description=None):
        # ``description`` is accepted (as a trailing optional argument, so
        # existing positional callers are unaffected) and forwarded, so that
        # discrete variables can be described at construction time just like
        # the base Variable.
        super(Discrete, self).__init__(id, entity, name, description)

    @property
    def interesting_values(self):
        """pandas Series holding this variable's interesting values."""
        return self._interesting_values

    @interesting_values.setter
    def interesting_values(self, values):
        # Drop duplicates while keeping the first occurrence of each value
        # in its original position.
        seen = set()
        unique_values = []
        for value in values:
            if value not in seen:
                seen.add(value)
                unique_values.append(value)
        # Keep the dtype chosen when the variable was created.
        self._interesting_values = pd.Series(unique_values,
                                             dtype=self._interesting_values.dtype)


class Boolean(Variable):
    """Represents variables that take on one of two values

    Args:
        true_values (list) : List of values treated as true.
            Defaults to [1, True, "true", "True", "yes", "t", "T"]
        false_values (list): List of values treated as false.
            Defaults to [0, False, "false", "False", "no", "f", "F"]
    """
    _default_pandas_dtype = bool

    def __init__(self, id, entity, name=None, true_values=None, false_values=None):
        # A missing or empty list falls back to the documented defaults.
        if not true_values:
            true_values = [1, True, "true", "True", "yes", "t", "T"]
        self.true_values = true_values
        if not false_values:
            false_values = [0, False, "false", "False", "no", "f", "F"]
        self.false_values = false_values
        super(Boolean, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's 'type' entry with both value lists."""
        data_description = super(Boolean, self).to_data_description()
        type_info = data_description['type']
        type_info['true_values'] = self.true_values
        type_info['false_values'] = self.false_values
        return data_description
class Categorical(Discrete):
    """Represents variables that can take unordered discrete values

    Args:
        categories (list) : List of categories. Stored as an empty list
            when omitted or empty.
    """

    def __init__(self, id, entity, name=None, categories=None):
        # Previously this read ``None or []``, which discarded the
        # ``categories`` argument and always stored an empty list.
        self.categories = categories or []
        super(Categorical, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's 'type' entry with the categories."""
        description = super(Categorical, self).to_data_description()
        description['type'].update({'categories': self.categories})
        return description
class Id(Categorical):
    """Categorical variable whose values identify another entity"""
    _default_pandas_dtype = int
class Ordinal(Discrete):
    """Discrete variable whose values are ordered"""
    _default_pandas_dtype = int
class Numeric(Variable):
    """Represents variables that contain numeric values

    Args:
        range (list, optional) : List of start and end. Can use inf and -inf
            to represent infinity. Unconstrained if not specified.
        start_inclusive (bool, optional) : Whether or not range includes
            the start value.
        end_inclusive (bool, optional) : Whether or not range includes the
            end value

    Attributes:
        max (float)
        min (float)
        std (float)
        mean (float)
    """
    _default_pandas_dtype = float

    def __init__(self, id, entity, name=None, range=None, start_inclusive=True, end_inclusive=False):
        # Previously this read ``None or []``, which discarded the ``range``
        # argument; it now matches how Timedelta stores its range.
        self.range = range or []
        self.start_inclusive = start_inclusive
        self.end_inclusive = end_inclusive
        super(Numeric, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's 'type' entry with the range settings."""
        description = super(Numeric, self).to_data_description()
        description['type'].update({
            'range': self.range,
            'start_inclusive': self.start_inclusive,
            'end_inclusive': self.end_inclusive,
        })
        return description
class Index(Variable):
    """Variable that uniquely identifies each instance of an entity

    Attributes:
        count (int)
    """
    _default_pandas_dtype = int
class Datetime(Variable):
    """Represents variables that are points in time

    Args:
        format (str): Python datetime format string documented
            `here <http://strftime.org/>`_.
    """
    _default_pandas_dtype = np.datetime64

    def __init__(self, id, entity, name=None, format=None):
        self.format = format
        super(Datetime, self).__init__(id, entity, name=name)

    def __repr__(self):
        template = u"<Variable: {} (dtype: {}, format: {})>"
        return template.format(self.name, self.type_string, self.format)

    def to_data_description(self):
        """Extend the base description's 'type' entry with the format."""
        data_description = super(Datetime, self).to_data_description()
        data_description['type']['format'] = self.format
        return data_description
class TimeIndex(Variable):
    """Variable acting as the time index of an entity"""
    _default_pandas_dtype = np.datetime64
class NumericTimeIndex(TimeIndex, Numeric):
    """Time index of an entity whose values are numeric"""
    _default_pandas_dtype = float
class DatetimeTimeIndex(TimeIndex, Datetime):
    """Time index of an entity whose values are datetimes"""
    _default_pandas_dtype = np.datetime64
class Timedelta(Variable):
    """Represents variables that are timedeltas

    Args:
        range (list, optional) : List of start and end of allowed range in
            seconds. Can use inf and -inf to represent infinity.
            Unconstrained if not specified.
        start_inclusive (bool, optional) : Whether or not range includes
            the start value.
        end_inclusive (bool, optional) : Whether or not range includes the
            end value
    """
    _default_pandas_dtype = np.timedelta64

    def __init__(self, id, entity, name=None, range=None, start_inclusive=True, end_inclusive=False):
        # A missing or empty range is stored as an empty list.
        self.range = range if range else []
        self.start_inclusive = start_inclusive
        self.end_inclusive = end_inclusive
        super(Timedelta, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description's 'type' entry with the range settings."""
        data_description = super(Timedelta, self).to_data_description()
        type_info = data_description['type']
        type_info['range'] = self.range
        type_info['start_inclusive'] = self.start_inclusive
        type_info['end_inclusive'] = self.end_inclusive
        return data_description
class NaturalLanguage(Variable):
    """Represents variables that are arbitrary strings"""
    _default_pandas_dtype = str
class Text(NaturalLanguage):
    """Deprecated variant of NaturalLanguage.

    Constructing it emits a FutureWarning and otherwise defers to
    NaturalLanguage.
    """

    def __init__(self, id, entity, name=None):
        warnings.warn(
            "Text has been deprecated. Please use NaturalLanguage instead.",
            category=FutureWarning,
        )
        super(Text, self).__init__(id, entity, name)


class PandasTypes(object):
    """Container of pandas dtype names, grouped by kind."""
    _all = 'all'
    _categorical = 'category'
    _pandas_datetimes = ['datetime64[ns]', 'datetime64[ns, tz]']
    _pandas_timedeltas = ['Timedelta']
    _pandas_numerics = ['int16', 'int32', 'int64',
                        'float16', 'float32', 'float64']
class LatLong(Variable):
    """Represents an ordered pair (Latitude, Longitude)

    To make a latlong in a dataframe do
        data['latlong'] = data[['latitude', 'longitude']].apply(tuple, axis=1)
    """
class ZIPCode(Categorical):
    """Represents a postal address in the United States.

    Consists of a series of digits which are cast as a string.
    Five-digit and nine-digit zipcodes are supported.
    """
    _default_pandas_dtype = str
class IPAddress(Variable):
    """Represents a computer network address.

    Represented in dotted-decimal notation. IPv4 and IPv6 are supported.
    """
    _default_pandas_dtype = str
class FullName(Variable):
    """Represents a person's full name.

    May consist of a first name, last name, and a title.
    """
    _default_pandas_dtype = str
class EmailAddress(Variable):
    """Represents an email box to which email messages are sent.

    Consists of a local-part, an @ symbol, and a domain.
    """
    _default_pandas_dtype = str
class URL(Variable):
    """Represents a valid web url (with or without http/www)"""
    _default_pandas_dtype = str
class PhoneNumber(Variable):
    """Represents any valid phone number.

    Can be with/without parentheses.
    Can be with/without area/country codes.
    """
    _default_pandas_dtype = str
class DateOfBirth(Datetime):
    """Datetime variable holding a date of birth"""
    _default_pandas_dtype = np.datetime64
class CountryCode(Categorical):
    """Represents an ISO-3166 standard country code.

    ISO 3166-1 (countries) are supported. These codes
    should be in the Alpha-2 format.
    e.g. United States of America = US
    """
    _default_pandas_dtype = str
class SubRegionCode(Categorical):
    """Represents an ISO-3166 standard sub-region code.

    ISO 3166-2 codes (sub-regions) are supported. These codes
    should be in the Alpha-2 format.
    e.g. United States of America, Arizona = US-AZ
    """
    _default_pandas_dtype = str
class FilePath(Variable):
    """Represents a valid filepath, absolute or relative"""
    _default_pandas_dtype = str
# One sample value for each type the variable classes use as
# ``_default_pandas_dtype``. The timestamp entry is evaluated once, when the
# module is imported.
# NOTE(review): presumably used to build placeholder data of the right
# dtype -- confirm against callers.
DEFAULT_DTYPE_VALUES = {
    np.datetime64: pd.Timestamp.now(),
    int: 0,
    float: 0.1,
    np.timedelta64: pd.Timedelta('1d'),
    object: 'object',
    bool: True,
    str: 'test'
}