Source code for featuretools_sklearn_transformer.transformer
import numpy as np
import pandas as pd
from featuretools.computational_backends import calculate_feature_matrix
from featuretools.synthesis import dfs
from sklearn.base import TransformerMixin
class DFSTransformer(TransformerMixin):
"""Transformer using Scikit-Learn interface for Pipeline uses.
"""
    def __init__(self,
entities=None,
relationships=None,
entityset=None,
target_entity=None,
agg_primitives=None,
trans_primitives=None,
allowed_paths=None,
max_depth=2,
ignore_entities=None,
ignore_variables=None,
seed_features=None,
drop_contains=None,
drop_exact=None,
where_primitives=None,
max_features=-1,
verbose=False):
"""Creates Transformer
Args:
entities (dict[str -> tuple(pd.DataFrame, str, str)]): Dictionary
of entities. Entries take the format
{entity id -> (dataframe, id column, (time_column))}.
relationships (list[(str, str, str, str)]): List of relationships
between entities. List items are a tuple with the format
(parent entity id, parent variable, child entity id, child
variable).
entityset (EntitySet): An already initialized entityset. Required
if entities and relationships are not defined.
target_entity (str): Entity id of entity on which to make
predictions.
agg_primitives (list[str or AggregationPrimitive], optional): List
of Aggregation Feature types to apply.
Default: ["sum", "std", "max", "skew", "min", "mean",
"count", "percent_true", "num_unique", "mode"]
trans_primitives (list[str or TransformPrimitive], optional):
List of Transform Feature functions to apply.
Default: ["day", "year", "month", "weekday", "haversine",
"num_words", "num_characters"]
allowed_paths (list[list[str]]): Allowed entity paths on which to
make features.
            max_depth (int): Maximum allowed depth of features. Defaults
                to 2.
ignore_entities (list[str], optional): List of entities to
blacklist when creating features.
ignore_variables (dict[str -> list[str]], optional): List of
specific variables within each entity to blacklist when
creating features.
seed_features (list[:class:`.FeatureBase`]): List of manually
defined features to use.
            drop_contains (list[str], optional): Drop features that
                contain these strings in their name.
            drop_exact (list[str], optional): Drop features whose names
                exactly match these strings.
            where_primitives (list[str or PrimitiveBase], optional):
                List of primitive names (or types) to apply with where
                clauses.
                Default: ["count"]
            max_features (int, optional): Cap the number of generated
                features to this number. If -1, no limit.
            verbose (bool, optional): If True, print progress while
                building and computing features. Defaults to False.
Example:
.. ipython:: python
import featuretools as ft
import pandas as pd
from featuretools.wrappers import DFSTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
                # Get example data
                n_customers = 5
                es = ft.demo.load_mock_customer(return_entityset=True,
                                                n_customers=n_customers)
                y = [True, False, True]
# Build dataset
pipeline = Pipeline(steps=[
('ft', DFSTransformer(entityset=es,
target_entity="customers",
max_features=3)),
('et', ExtraTreesClassifier(n_estimators=100))
])
# Fit and predict
                pipeline.fit([1, 2, 3], y=y)  # fit on first 3 customers
                pipeline.predict_proba([4, 5])  # predict probability of each class on last 2
                pipeline.predict([4, 5])  # predict on last 2
# Same as above, but using cutoff times
ct = pd.DataFrame()
ct['customer_id'] = [1, 2, 3, 4, 5]
ct['time'] = pd.to_datetime(['2014-1-1 04:00',
'2014-1-2 17:20',
'2014-1-4 09:53',
'2014-1-4 13:48',
'2014-1-5 15:32'])
pipeline.fit(ct.head(3), y=y)
pipeline.predict_proba(ct.tail(2))
pipeline.predict(ct.tail(2))
"""
self.feature_defs = []
self.entities = entities
self.relationships = relationships
self.entityset = entityset
self.target_entity = target_entity
self.agg_primitives = agg_primitives
self.trans_primitives = trans_primitives
self.allowed_paths = allowed_paths
self.max_depth = max_depth
self.ignore_entities = ignore_entities
self.ignore_variables = ignore_variables
self.seed_features = seed_features
self.drop_contains = drop_contains
self.drop_exact = drop_exact
self.where_primitives = where_primitives
self.max_features = max_features
self.verbose = verbose
    def fit(self, cutoff_time_ids, y=None):
        """Wrapper for DFS

        Builds feature definitions for the given instances or cutoff
        times, using the entities and relationships this transformer was
        configured with.

        Args:
            cutoff_time_ids (list | DataFrame): List of instance ids, or
                a DataFrame of instance ids and their cutoff times, to
                build features for.
            y (iterable, optional): Training targets. Ignored; present
                only for compatibility with the scikit-learn API.

        See Also:
            :func:`synthesis.dfs`
        """
        if isinstance(cutoff_time_ids, (list, np.ndarray, pd.Series)):
            # A plain collection is interpreted as instance ids.
            time_kwargs = {'instance_ids': cutoff_time_ids}
        elif isinstance(cutoff_time_ids, pd.DataFrame):
            # A DataFrame is interpreted as (instance id, cutoff time) rows.
            time_kwargs = {'cutoff_time': cutoff_time_ids}
        else:
            raise TypeError('cutoff_time_ids must be a list, np.ndarray, '
                            'pd.Series, or pd.DataFrame')

        self.feature_defs = dfs(entities=self.entities,
                                relationships=self.relationships,
                                entityset=self.entityset,
                                target_entity=self.target_entity,
                                agg_primitives=self.agg_primitives,
                                trans_primitives=self.trans_primitives,
                                allowed_paths=self.allowed_paths,
                                max_depth=self.max_depth,
                                ignore_entities=self.ignore_entities,
                                ignore_variables=self.ignore_variables,
                                seed_features=self.seed_features,
                                drop_contains=self.drop_contains,
                                drop_exact=self.drop_exact,
                                where_primitives=self.where_primitives,
                                max_features=self.max_features,
                                features_only=True,
                                verbose=self.verbose,
                                **time_kwargs)
        return self
    def transform(self, cutoff_time_ids):
        """Wrapper for calculate_feature_matrix

        Calculates a feature matrix for the given set of instance ids or
        (instance id, cutoff time) pairs, using the feature definitions
        built during fit.

        Args:
            cutoff_time_ids (list | DataFrame): List of instance ids, or
                a DataFrame of instance ids and their cutoff times, to
                calculate features on.

        See Also:
            :func:`computational_backends.calculate_feature_matrix`
        """
        if isinstance(cutoff_time_ids, (list, np.ndarray, pd.Series)):
            X_transformed = calculate_feature_matrix(
                features=self.feature_defs,
                entityset=self.entityset,
                instance_ids=cutoff_time_ids,
                entities=self.entities,
                relationships=self.relationships,
                verbose=self.verbose)
            # Reindex so rows come back in the same order as the input ids.
            X_transformed = X_transformed.loc[cutoff_time_ids]
        elif isinstance(cutoff_time_ids, pd.DataFrame):
            ct = cutoff_time_ids
            X_transformed = calculate_feature_matrix(
                features=self.feature_defs,
                entityset=self.entityset,
                cutoff_time=ct,
                entities=self.entities,
                relationships=self.relationships,
                verbose=self.verbose)
            # The first column of the cutoff-time DataFrame is assumed to
            # hold the instance ids; reindex to match the input order.
            X_transformed = X_transformed.loc[ct[ct.columns[0]]]
        else:
            raise TypeError('cutoff_time_ids must be a list, np.ndarray, '
                            'pd.Series, or pd.DataFrame')
        return X_transformed
def get_params(self, deep=True):
out = {
'entityset': self.entityset,
'target_entity': self.target_entity,
'entities': self.entities,
'relationships': self.relationships,
'agg_primitives': self.agg_primitives,
'trans_primitives': self.trans_primitives,
'allowed_paths': self.allowed_paths,
'max_depth': self.max_depth,
'ignore_entities': self.ignore_entities,
'ignore_variables': self.ignore_variables,
'seed_features': self.seed_features,
'drop_contains': self.drop_contains,
'drop_exact': self.drop_exact,
'where_primitives': self.where_primitives,
'max_features': self.max_features,
'verbose': self.verbose,
}
return out
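
# Example usage: a minimal sketch of calling the transformer directly,
# outside a Pipeline, assuming the featuretools mock-customer demo data
# is available (as in the docstring example above). fit() builds the
# feature definitions; transform() computes the feature matrix.
if __name__ == "__main__":
    import featuretools as ft

    es = ft.demo.load_mock_customer(return_entityset=True, n_customers=5)
    transformer = DFSTransformer(entityset=es,
                                 target_entity="customers",
                                 max_features=3)
    # Build feature definitions on the first three customers...
    transformer.fit([1, 2, 3])
    # ...then compute the same features for the last two.
    feature_matrix = transformer.transform([4, 5])
    print(feature_matrix.head())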