NOTICE

The upcoming release of Featuretools 1.0.0 contains several breaking changes. Users are encouraged to test this version prior to release:

pip install featuretools==1.0.0rc1

For details on migrating to the new version, refer to Transitioning to Featuretools Version 1.0. Please report any issues in the Featuretools GitHub repo or by messaging in Alteryx Open Source Slack.


Source code for featuretools.entityset.entity

import logging
import warnings

import dask.dataframe as dd
import numpy as np
import pandas as pd

from featuretools import variable_types as vtypes
from featuretools.utils.entity_utils import (
    col_is_datetime,
    convert_all_variable_data,
    convert_variable_data,
    get_linked_vars,
    infer_variable_types
)
from featuretools.utils.gen_utils import import_or_none, is_instance
from featuretools.utils.wrangle import _check_time_type, _dataframes_equal
from featuretools.variable_types import Text, find_variable_types

ks = import_or_none('databricks.koalas')

logger = logging.getLogger('featuretools.entityset')

_numeric_types = vtypes.PandasTypes._pandas_numerics
_categorical_types = [vtypes.PandasTypes._categorical]
_datetime_types = vtypes.PandasTypes._pandas_datetimes


[docs]class Entity(object): """Represents an entity in a Entityset, and stores relevant metadata and data An Entity is analogous to a table in a relational database See Also: :class:`.Relationship`, :class:`.Variable`, :class:`.EntitySet` """
[docs] def __init__(self, id, df, entityset, variable_types=None, index=None, time_index=None, secondary_time_index=None, last_time_index=None, already_sorted=False, make_index=False, verbose=False): """ Create Entity Args: id (str): Id of Entity. df (pd.DataFrame): Dataframe providing the data for the entity. entityset (EntitySet): Entityset for this Entity. variable_types (dict[str -> type/str/dict[str -> type]]) : An entity's variable_types dict maps string variable ids to types (:class:`.Variable`) or type_string (str) or (type, kwargs) to pass keyword arguments to the Variable. index (str): Name of id column in the dataframe. time_index (str): Name of time column in the dataframe. secondary_time_index (dict[str -> str]): Dictionary mapping columns in the dataframe to the time index column they are associated with. last_time_index (pd.Series): Time index of the last event for each instance across all child entities. make_index (bool, optional) : If True, assume index does not exist as a column in dataframe, and create a new column of that name using integers the (0, len(dataframe)). Otherwise, assume index exists in dataframe. """ _validate_entity_params(id, df, time_index) created_index, index, df = _create_index(index, make_index, df) self.id = id self.entityset = entityset self.data = {'df': df, 'last_time_index': last_time_index} self.created_index = created_index self._verbose = verbose secondary_time_index = secondary_time_index or {} self._create_variables(variable_types, index, time_index, secondary_time_index) self.df = df[[v.id for v in self.variables]] self.set_index(index) self.time_index = None if time_index: self.set_time_index(time_index, already_sorted=already_sorted) self.set_secondary_time_index(secondary_time_index)
def __repr__(self): repr_out = u"Entity: {}\n".format(self.id) repr_out += u" Variables:" for v in self.variables: repr_out += u"\n {} (dtype: {})".format(v.id, v.type_string) shape = self.shape repr_out += u"\n Shape:\n (Rows: {}, Columns: {})".format( shape[0], shape[1]) return repr_out @property def shape(self): '''Shape of the entity's dataframe''' return self.df.shape def __eq__(self, other, deep=False): if self.index != other.index: return False if self.time_index != other.time_index: return False if self.secondary_time_index != other.secondary_time_index: return False if len(self.variables) != len(other.variables): return False if set(self.variables) != set(other.variables): return False if deep: if self.last_time_index is None and other.last_time_index is not None: return False elif self.last_time_index is not None and other.last_time_index is None: return False elif self.last_time_index is not None and other.last_time_index is not None: if not self.last_time_index.equals(other.last_time_index): return False if not _dataframes_equal(self.df, other.df): return False variables = {variable: (variable, ) for variable in self.variables} for variable in other.variables: variables[variable] += (variable, ) for self_var, other_var in variables.values(): if not self_var.__eq__(other_var, deep=True): return False return True def __sizeof__(self): return sum([value.__sizeof__() for value in self.data.values()]) @property def df(self): '''Dataframe providing the data for the entity.''' return self.data["df"] @df.setter def df(self, _df): self.data["df"] = _df @property def last_time_index(self): ''' Time index of the last event for each instance across all child entities. ''' return self.data["last_time_index"] @last_time_index.setter def last_time_index(self, lti): self.data["last_time_index"] = lti def __hash__(self): return id(self.id) def __getitem__(self, variable_id): return self._get_variable(variable_id) def _get_variable(self, variable_id): """Get variable instance Args: variable_id (str) : Id of variable to get. Returns: :class:`.Variable` : Instance of variable. Raises: RuntimeError : if no variable exist with provided id """ for v in self.variables: if v.id == variable_id: return v raise KeyError("Variable: %s not found in entity" % (variable_id)) @property def variable_types(self): '''Dictionary mapping variable id's to variable types''' return {v.id: type(v) for v in self.variables}
[docs] def convert_variable_type(self, variable_id, new_type, convert_data=True, **kwargs): """Convert variable in dataframe to different type Args: variable_id (str) : Id of variable to convert. new_type (subclass of `Variable`) : Type of variable to convert to. entityset (:class:`.BaseEntitySet`) : EntitySet associated with this entity. convert_data (bool) : If True, convert underlying data in the EntitySet. Raises: RuntimeError : Raises if it cannot convert the underlying data Examples: >>> from featuretools.tests.testing_utils import make_ecommerce_entityset >>> es = make_ecommerce_entityset() >>> es["customers"].convert_variable_type("engagement_level", vtypes.Categorical) """ if convert_data: # first, convert the underlying data (or at least try to) self.df = convert_variable_data(df=self.df, column_id=variable_id, new_type=new_type, **kwargs) # replace the old variable with the new one, maintaining order variable = self._get_variable(variable_id) new_variable = new_type.create_from(variable) self.variables[self.variables.index(variable)] = new_variable
def _create_variables(self, variable_types, index, time_index, secondary_time_index): """Extracts the variables from a dataframe Args: variable_types (dict[str -> types/str/dict[str -> type]]) : An entity's variable_types dict maps string variable ids to types (:class:`.Variable`) or type_strings (str) or (type, kwargs) to pass keyword arguments to the Variable. index (str): Name of index column time_index (str or None): Name of time_index column secondary_time_index (dict[str: [str]]): Dictionary of secondary time columns that each map to a list of columns that depend on that secondary time """ variables = [] variable_types = variable_types.copy() or {} string_to_class_map = find_variable_types() # TODO: Remove once Text has been removed from variable types string_to_class_map[Text.type_string] = Text for vid in variable_types.copy(): vtype = variable_types[vid] if isinstance(vtype, str): if vtype in string_to_class_map: variable_types[vid] = string_to_class_map[vtype] else: variable_types[vid] = string_to_class_map['unknown'] warnings.warn("Variable type {} was unrecognized, Unknown variable type was used instead".format(vtype)) if index not in variable_types: variable_types[index] = vtypes.Index link_vars = get_linked_vars(self) inferred_variable_types = infer_variable_types(self.df, link_vars, variable_types, time_index, secondary_time_index) inferred_variable_types.update(variable_types) for v in inferred_variable_types: # TODO document how vtype can be tuple vtype = inferred_variable_types[v] if isinstance(vtype, tuple): # vtype is (ft.Variable, dict_of_kwargs) _v = vtype[0](v, self, **vtype[1]) else: _v = inferred_variable_types[v](v, self) variables += [_v] # convert data once we've inferred self.df = convert_all_variable_data(df=self.df, variable_types=inferred_variable_types) # make sure index is at the beginning index_variable = [v for v in variables if v.id == index][0] self.variables = [index_variable] + [v for v in variables if v.id != index] def update_data(self, df, already_sorted=False, recalculate_last_time_indexes=True): '''Update entity's internal dataframe, optionaly making sure data is sorted, reference indexes to other entities are consistent, and last_time_indexes are consistent. ''' if len(df.columns) != len(self.variables): raise ValueError("Updated dataframe contains {} columns, expecting {}".format(len(df.columns), len(self.variables))) for v in self.variables: if v.id not in df.columns: raise ValueError("Updated dataframe is missing new {} column".format(v.id)) # Make sure column ordering matches variable ordering self.df = df[[v.id for v in self.variables]] self.set_index(self.index) if self.time_index is not None: self.set_time_index(self.time_index, already_sorted=already_sorted) self.set_secondary_time_index(self.secondary_time_index) if recalculate_last_time_indexes and self.last_time_index is not None: self.entityset.add_last_time_indexes(updated_entities=[self.id]) self.entityset.reset_data_description()
[docs] def add_interesting_values(self, max_values=5, verbose=False): """ Find interesting values for categorical variables, to be used to generate "where" clauses Args: max_values (int) : Maximum number of values per variable to add. verbose (bool) : If True, print summary of interesting values found. Returns: None """ for variable in self.variables: # some heuristics to find basic 'where'-able variables if isinstance(variable, vtypes.Discrete): variable.interesting_values = pd.Series(dtype=variable.entity.df[variable.id].dtype) # TODO - consider removing this constraints # don't add interesting values for entities in relationships skip = False for r in self.entityset.relationships: if variable in [r.child_variable, r.parent_variable]: skip = True break if skip: continue counts = self.df[variable.id].value_counts() # find how many of each unique value there are; sort by count, # and add interesting values to each variable total_count = np.sum(counts) counts[:] = counts.sort_values()[::-1] for i in range(min(max_values, len(counts.index))): idx = counts.index[i] # add the value to interesting_values if it represents more than # 25% of the values we have not seen so far if len(counts.index) < 25: if verbose: msg = "Variable {}: Marking {} as an " msg += "interesting value" logger.info(msg.format(variable.id, idx)) variable.interesting_values = variable.interesting_values.append(pd.Series([idx])) else: fraction = counts[idx] / total_count if fraction > 0.05 and fraction < 0.95: if verbose: msg = "Variable {}: Marking {} as an " msg += "interesting value" logger.info(msg.format(variable.id, idx)) variable.interesting_values = variable.interesting_values.append(pd.Series([idx])) # total_count -= counts[idx] else: break self.entityset.reset_data_description()
def delete_variables(self, variable_ids): """ Remove variables from entity's dataframe and from self.variables Args: variable_ids (list[str]): Variables to delete Returns: None """ # check if variable is not a list if not isinstance(variable_ids, list): raise TypeError('variable_ids must be a list of variable names') if len(variable_ids) == 0: return self.df = self.df.drop(variable_ids, axis=1) for v_id in variable_ids: v = self._get_variable(v_id) self.variables.remove(v) def set_time_index(self, variable_id, already_sorted=False): # check time type if not isinstance(self.df, pd.DataFrame) or self.df.empty: time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[variable_id]._default_pandas_dtype] else: time_to_check = self.df[variable_id].iloc[0] time_type = _check_time_type(time_to_check) if time_type is None: raise TypeError("%s time index not recognized as numeric or" " datetime" % (self.id)) if self.entityset.time_type is None: self.entityset.time_type = time_type elif self.entityset.time_type != time_type: raise TypeError("%s time index is %s type which differs from" " other entityset time indexes" % (self.id, time_type)) if is_instance(self.df, (dd, ks), 'DataFrame'): t = time_type # skip checking values already_sorted = True # skip sorting else: t = vtypes.NumericTimeIndex if col_is_datetime(self.df[variable_id]): t = vtypes.DatetimeTimeIndex # use stable sort if not already_sorted: # sort by time variable, then by index self.df = self.df.sort_values([variable_id, self.index]) self.convert_variable_type(variable_id, t, convert_data=False) self.time_index = variable_id def set_index(self, variable_id, unique=True): """ Args: variable_id (string) : Name of an existing variable to set as index. unique (bool) : Whether to assert that the index is unique. """ if isinstance(self.df, pd.DataFrame): self.df = self.df.set_index(self.df[variable_id], drop=False) self.df.index.name = None if unique: assert self.df.index.is_unique, "Index is not unique on dataframe " \ "(Entity {})".format(self.id) self.convert_variable_type(variable_id, vtypes.Index, convert_data=False) self.index = variable_id def set_secondary_time_index(self, secondary_time_index): for time_index, columns in secondary_time_index.items(): if is_instance(self.df, (dd, ks), 'DataFrame') or self.df.empty: time_to_check = vtypes.DEFAULT_DTYPE_VALUES[self[time_index]._default_pandas_dtype] else: time_to_check = self.df[time_index].head(1).iloc[0] time_type = _check_time_type(time_to_check) if time_type is None: raise TypeError("%s time index not recognized as numeric or" " datetime" % (self.id)) if self.entityset.time_type != time_type: raise TypeError("%s time index is %s type which differs from" " other entityset time indexes" % (self.id, time_type)) if time_index not in columns: columns.append(time_index) self.secondary_time_index = secondary_time_index
def _create_index(index, make_index, df): '''Handles index creation logic base on user input''' created_index = None if index is None: # Case 1: user wanted to make index but did not specify column name assert not make_index, "Must specify an index name if make_index is True" # Case 2: make_index not specified but no index supplied, use first column warnings.warn(("Using first column as index. " "To change this, specify the index parameter")) index = df.columns[0] elif make_index and index in df.columns: # Case 3: user wanted to make index but column already exists raise RuntimeError("Cannot make index: index variable already present") elif index not in df.columns: if not make_index: # Case 4: user names index, it is not in df. does not specify # make_index. Make new index column and warn warnings.warn("index {} not found in dataframe, creating new " "integer column".format(index)) # Case 5: make_index with no errors or warnings # (Case 4 also uses this code path) if isinstance(df, dd.DataFrame): df[index] = 1 df[index] = df[index].cumsum() - 1 elif is_instance(df, ks, 'DataFrame'): df = df.koalas.attach_id_column('distributed-sequence', index) else: df.insert(0, index, range(len(df))) created_index = index # Case 6: user specified index, which is already in df. No action needed. return created_index, index, df def _validate_entity_params(id, df, time_index): '''Validation checks for Entity inputs''' assert isinstance(id, str), "Entity id must be a string" assert len(df.columns) == len(set(df.columns)), "Duplicate column names" for c in df.columns: if not isinstance(c, str): raise ValueError("All column names must be strings (Column {} " "is not a string)".format(c)) if time_index is not None and time_index not in df.columns: raise LookupError('Time index not found in dataframe')