# Source code for featuretools.variable_types.variable

import numpy as np
import pandas as pd

from featuretools.utils.gen_utils import camel_to_snake


class ClassNameDescriptor(object):
    """Read-only descriptor yielding the owner class's name in snake case.

    The camel-case ``__name__`` of the class the attribute is looked up on
    is passed through ``camel_to_snake`` on every access.
    """

    def __get__(self, instance, class_):
        # Only the owning class matters; the instance (if any) is ignored.
        owner_name = class_.__name__
        return camel_to_snake(owner_name)


class Variable(object):
    """Describe a single variable of an entity.

    A Variable plays the same role as a column of a table in a relational
    database.

    Args:
        id (str) : Id of variable. Must match underlying data in Entity
            it belongs to.
        entity (:class:`.Entity`) : Entity this variable belongs to.
        name (str, optional) : Variable name. Defaults to id.

    See Also:
        :class:`.Entity`, :class:`.Relationship`, :class:`.BaseEntitySet`
    """
    # Snake-case name of the concrete class, resolved on each access.
    type_string = ClassNameDescriptor()
    # dtype given to the empty interesting-values Series when the entity's
    # dataframe has no column named after this variable's id.
    _default_pandas_dtype = object

    def __init__(self, id, entity, name=None):
        assert isinstance(id, str), "Variable id must be a string"
        self.id = id
        self._name = name
        self.entity_id = entity.id
        assert entity.entityset is not None, "Entity must contain reference to EntitySet"
        self.entity = entity

        frame = entity.df
        if id in frame:
            # Column exists: mirror its dtype.
            values_dtype = frame[id].dtype
        else:
            # Column missing: fall back to the class default, spelling out
            # nanosecond resolution for the numpy datetime/timedelta types.
            values_dtype = self._default_pandas_dtype
            if values_dtype == np.datetime64:
                values_dtype = 'datetime64[ns]'
            elif values_dtype == np.timedelta64:
                values_dtype = 'timedelta64[ns]'
        self._interesting_values = pd.Series(dtype=values_dtype)

    @property
    def entityset(self):
        """EntitySet referenced by this variable's entity."""
        return self.entity.entityset

    def __eq__(self, other, deep=False):
        """Compare by class, id and entity id.

        With ``deep=True`` the sets of interesting values must match too.
        """
        same_identity = (isinstance(other, self.__class__)
                         and self.id == other.id
                         and self.entity_id == other.entity_id)
        if deep and same_identity:
            mine = set(self.interesting_values.values)
            theirs = set(other.interesting_values.values)
            return mine == theirs
        return same_identity

    def __hash__(self):
        # Hash on the same fields the shallow equality check compares.
        key = (self.id, self.entity_id)
        return hash(key)

    def __repr__(self):
        template = u"<Variable: {} (dtype = {})>"
        return template.format(self.name, self.type_string)

    @classmethod
    def create_from(cls, variable):
        """Build a variable of this class from an existing variable.

        Args:
            variable (Variable) : Existing variable to create from.

        Returns:
            :class:`.Variable` : new variable

        """
        return cls(id=variable.id, name=variable.name, entity=variable.entity)

    @property
    def name(self):
        """Explicit name if one was set, otherwise the id."""
        if self._name is None:
            return self.id
        return self._name

    @name.setter
    def name(self, name):
        self._name = name

    @property
    def dtype(self):
        """The class's type string, or "generic_type" when that is None."""
        type_string = self.type_string
        if type_string is None:
            return "generic_type"
        return type_string

    @property
    def interesting_values(self):
        """pandas Series holding this variable's interesting values."""
        return self._interesting_values

    @interesting_values.setter
    def interesting_values(self, interesting_values):
        # Keep the dtype chosen at construction time.
        current_dtype = self._interesting_values.dtype
        self._interesting_values = pd.Series(interesting_values,
                                             dtype=current_dtype)

    @property
    def series(self):
        """Column of the entity's dataframe named after this variable's id."""
        return self.entity.df[self.id]

    def to_data_description(self):
        """Return a nested dict describing this variable.

        The interesting values are included as a JSON string.
        """
        type_info = {'value': self.type_string}
        properties = {
            'name': self.name,
            'entity': self.entity.id,
            'interesting_values': self._interesting_values.to_json(),
        }
        return {
            'id': self.id,
            'type': type_info,
            'properties': properties,
        }


class Unknown(Variable):
    """Variable subclass that adds no behavior of its own."""


class Discrete(Variable):
    """Base class for variables whose values are discrete."""

    def __init__(self, id, entity, name=None):
        super(Discrete, self).__init__(id, entity, name)

    @property
    def interesting_values(self):
        """pandas Series of interesting values, without duplicates."""
        return self._interesting_values

    @interesting_values.setter
    def interesting_values(self, values):
        # Drop repeated values while keeping first-occurrence order.
        already_seen = set()
        unique_values = []
        for value in values:
            if value in already_seen:
                continue
            already_seen.add(value)
            unique_values.append(value)
        # Keep the dtype chosen at construction time.
        current_dtype = self._interesting_values.dtype
        self._interesting_values = pd.Series(unique_values,
                                             dtype=current_dtype)


class Boolean(Variable):
    """Represents variables that take on one of two values

    Args:
        true_values (list) : List of valid true values. Defaults to [1, True, "true", "True", "yes", "t", "T"]
        false_values (list): List of valid false values. Defaults to [0, False, "false", "False", "no", "f", "F"]
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = bool

    def __init__(self,
                 id,
                 entity,
                 name=None,
                 true_values=None,
                 false_values=None):
        # A falsy argument (None or an empty list) selects the defaults.
        default = [1, True, "true", "True", "yes", "t", "T"]
        self.true_values = true_values or default
        default = [0, False, "false", "False", "no", "f", "F"]
        self.false_values = false_values or default
        super(Boolean, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description with the true/false value lists."""
        description = super(Boolean, self).to_data_description()
        description['type'].update({
            'true_values': self.true_values,
            'false_values': self.false_values
        })
        return description


class Categorical(Discrete):
    """Represents variables that can take an unordered discrete values

    Args:
        categories (list) : List of categories. If left blank, inferred from data.
    """

    def __init__(self, id, entity, name=None, categories=None):
        # Store the caller's categories; the previous `None or []` always
        # produced an empty list and silently dropped the argument.
        self.categories = categories or []
        super(Categorical, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description with the list of categories."""
        description = super(Categorical, self).to_data_description()
        description['type'].update({'categories': self.categories})
        return description


class Id(Categorical):
    """Represents variables that identify another entity"""
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = int


class Ordinal(Discrete):
    """Represents variables that take on an ordered discrete value"""
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = int


class Numeric(Variable):
    """Represents variables that contain numeric values

    Args:
        range (list, optional) : List of start and end. Can use inf and -inf to represent infinity. Unconstrained if not specified.
        start_inclusive (bool, optional) : Whether or not range includes the start value.
        end_inclusive (bool, optional) : Whether or not range includes the end value

    Attributes:
        max (float)
        min (float)
        std (float)
        mean (float)
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = float

    def __init__(self,
                 id,
                 entity,
                 name=None,
                 range=None,
                 start_inclusive=True,
                 end_inclusive=False):
        # Store the caller's range; the previous `None or []` always
        # produced an empty list and silently dropped the argument.
        self.range = range or []
        self.start_inclusive = start_inclusive
        self.end_inclusive = end_inclusive
        super(Numeric, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description with the range and its inclusivity."""
        description = super(Numeric, self).to_data_description()
        description['type'].update({
            'range': self.range,
            'start_inclusive': self.start_inclusive,
            'end_inclusive': self.end_inclusive,
        })
        return description


class Index(Variable):
    """Represents variables that uniquely identify an instance of an entity

    Attributes:
        count (int)
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = int


class Datetime(Variable):
    """Represents variables that are points in time

    Args:
        format (str): Python datetime format string documented `here <http://strftime.org/>`_.
    """
    # Variable.__init__ turns this into 'datetime64[ns]' for the empty
    # interesting-values Series when the column is absent from the dataframe.
    _default_pandas_dtype = np.datetime64

    def __init__(self, id, entity, name=None, format=None):
        self.format = format
        super(Datetime, self).__init__(id, entity, name=name)

    def __repr__(self):
        return u"<Variable: {} (dtype: {}, format: {})>".format(self.name, self.type_string, self.format)

    def to_data_description(self):
        """Extend the base description with the datetime format string."""
        description = super(Datetime, self).to_data_description()
        description['type'].update({'format': self.format})
        return description


class TimeIndex(Variable):
    """Represents time index of entity"""
    # Variable.__init__ turns this into 'datetime64[ns]' for the empty
    # interesting-values Series when the column is absent from the dataframe.
    _default_pandas_dtype = np.datetime64


class NumericTimeIndex(TimeIndex, Numeric):
    """Represents time index of entity that is numeric"""
    # Overrides the datetime default inherited from TimeIndex.
    _default_pandas_dtype = float


class DatetimeTimeIndex(TimeIndex, Datetime):
    """Represents time index of entity that is a datetime"""
    # Variable.__init__ turns this into 'datetime64[ns]' for the empty
    # interesting-values Series when the column is absent from the dataframe.
    _default_pandas_dtype = np.datetime64


class Timedelta(Variable):
    """Variable type for timedelta values.

    Args:
        range (list, optional) : List of start and end of allowed range in seconds. Can use inf and -inf to represent infinity. Unconstrained if not specified.
        start_inclusive (bool, optional) : Whether or not range includes the start value.
        end_inclusive (bool, optional) : Whether or not range includes the end value
    """
    # Variable.__init__ turns this into 'timedelta64[ns]' for the empty
    # interesting-values Series when the column is absent from the dataframe.
    _default_pandas_dtype = np.timedelta64

    def __init__(self,
                 id,
                 entity,
                 name=None,
                 range=None,
                 start_inclusive=True,
                 end_inclusive=False):
        # A falsy range (None or empty) is stored as a fresh empty list.
        self.range = range if range else []
        self.start_inclusive = start_inclusive
        self.end_inclusive = end_inclusive
        super(Timedelta, self).__init__(id, entity, name=name)

    def to_data_description(self):
        """Extend the base description with the range and its inclusivity."""
        description = super(Timedelta, self).to_data_description()
        type_info = description['type']
        type_info['range'] = self.range
        type_info['start_inclusive'] = self.start_inclusive
        type_info['end_inclusive'] = self.end_inclusive
        return description


class Text(Variable):
    """Represents variables that are arbitrary strings"""
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class PandasTypes(object):
    """Namespace of dtype-name strings, grouped by kind."""
    _all = 'all'
    _categorical = 'category'
    _pandas_datetimes = [
        'datetime64[ns]',
        'datetime64[ns, tz]',
    ]
    _pandas_timedeltas = ['Timedelta']
    _pandas_numerics = [
        'int16',
        'int32',
        'int64',
        'float16',
        'float32',
        'float64',
    ]


class LatLong(Variable):
    """Represents an ordered pair (Latitude, Longitude)

    To make a latlong in a dataframe do
    data['latlong'] = data[['latitude', 'longitude']].apply(tuple, axis=1)
    """


class ZIPCode(Categorical):
    """Represents a postal address in the United States.
    Consists of a series of digits which are cast to a
    string. Five digit and 9 digit zipcodes are supported.
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class IPAddress(Variable):
    """Represents a computer network address. Represented
    in dotted-decimal notation. IPv4 and IPv6 are supported.
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class FullName(Variable):
    """Represents a person's full name. May consist of a
    first name, last name, and a title.
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class EmailAddress(Variable):
    """Represents an email box to which email messages are sent.
    Consists of a local-part, an @ symbol, and a domain.
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class URL(Variable):
    """Represents a valid web url (with or without http/www)"""
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class PhoneNumber(Variable):
    """Represents any valid phone number.
    Can be with/without parentheses.
    Can be with/without area/country codes.
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class DateOfBirth(Datetime):
    """Represents a date of birth as a datetime"""
    # Variable.__init__ turns this into 'datetime64[ns]' for the empty
    # interesting-values Series when the column is absent from the dataframe.
    _default_pandas_dtype = np.datetime64


class CountryCode(Categorical):
    """Represents an ISO-3166 standard country code.
    ISO 3166-1 (countries) are supported. These codes
    should be in the Alpha-2 format.
    e.g. United States of America = US
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class SubRegionCode(Categorical):
    """Represents an ISO-3166 standard sub-region code.
    ISO 3166-2 codes (sub-regions) are supported. These codes
    should be in the Alpha-2 format.
    e.g. United States of America, Arizona = US-AZ
    """
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


class FilePath(Variable):
    """Represents a valid filepath, absolute or relative"""
    # dtype used for the empty interesting-values Series when the column is
    # absent from the entity's dataframe (see Variable.__init__).
    _default_pandas_dtype = str


# One sample value per `_default_pandas_dtype` used by the variable classes
# in this module. The timestamp is captured once, when the module is imported.
# NOTE(review): presumably used to build placeholder data; confirm with callers.
DEFAULT_DTYPE_VALUES = {
    # numpy temporal types
    np.datetime64: pd.Timestamp.now(),
    # plain Python numeric types
    int: 0,
    float: 0.1,
    np.timedelta64: pd.Timedelta('1d'),
    # everything else
    object: 'object',
    bool: True,
    str: 'test',
}