Source code for featuretools_sklearn_transformer.transformer

from featuretools.computational_backends import calculate_feature_matrix
from featuretools.synthesis import dfs
from sklearn.base import TransformerMixin


class DFSTransformer(TransformerMixin):
    """Transformer using the Scikit-Learn interface, for use in Pipelines."""
    def __init__(self,
                 target_dataframe_name=None,
                 agg_primitives=None,
                 trans_primitives=None,
                 allowed_paths=None,
                 max_depth=2,
                 ignore_dataframes=None,
                 ignore_columns=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_primitives=None,
                 max_features=-1,
                 verbose=False):
        """Creates Transformer

        Args:
            target_dataframe_name (str): Name of the dataframe on which to
                make predictions.

            agg_primitives (list[str or AggregationPrimitive], optional):
                List of Aggregation Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean",
                "count", "percent_true", "num_unique", "mode"]

            trans_primitives (list[str or TransformPrimitive], optional):
                List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine",
                "num_words", "num_characters"]

            allowed_paths (list[list[str]]): Allowed dataframe paths on which
                to make features.

            max_depth (int): Maximum allowed depth of features.

            ignore_dataframes (list[str], optional): List of dataframes to
                blacklist when creating features.

            ignore_columns (dict[str -> list[str]], optional): List of
                specific columns within each dataframe to blacklist when
                creating features.

            seed_features (list[:class:`.FeatureBase`]): List of manually
                defined features to use.

            drop_contains (list[str], optional): Drop features whose names
                contain these strings.

            drop_exact (list[str], optional): Drop features whose names
                exactly match these strings.

            where_primitives (list[str or PrimitiveBase], optional): List of
                primitive names (or types) to apply with where clauses.

                Default: ["count"]

            max_features (int, optional): Cap the number of generated
                features at this number. If -1, no limit.

        Example:
            .. ipython:: python

                import featuretools as ft
                import pandas as pd
                from featuretools.wrappers import DFSTransformer
                from sklearn.pipeline import Pipeline
                from sklearn.ensemble import ExtraTreesClassifier

                # Get example data
                train_es = ft.demo.load_mock_customer(return_entityset=True,
                                                      n_customers=3)
                test_es = ft.demo.load_mock_customer(return_entityset=True,
                                                     n_customers=2)
                y = [True, False, True]

                # Build pipeline
                pipeline = Pipeline(steps=[
                    ('ft', DFSTransformer(target_dataframe_name="customers",
                                          max_features=2)),
                    ('et', ExtraTreesClassifier(n_estimators=100))
                ])

                # Fit and predict
                pipeline.fit(X=train_es, y=y)  # fit on customers in training entityset
                pipeline.predict_proba(test_es)  # predict probability of each class on test entityset
                pipeline.predict(test_es)  # predict on test entityset

                # Same as above, but using cutoff times
                train_ct = pd.DataFrame()
                train_ct['customer_id'] = [1, 2, 3]
                train_ct['time'] = pd.to_datetime(['2014-1-1 04:00',
                                                   '2014-1-2 17:20',
                                                   '2014-1-4 09:53'])

                pipeline.fit(X=(train_es, train_ct), y=y)

                test_ct = pd.DataFrame()
                test_ct['customer_id'] = [1, 2]
                test_ct['time'] = pd.to_datetime(['2014-1-4 13:48',
                                                  '2014-1-5 15:32'])
                pipeline.predict_proba((test_es, test_ct))
                pipeline.predict((test_es, test_ct))
        """
        self.feature_defs = []
        self.target_dataframe_name = target_dataframe_name
        self.agg_primitives = agg_primitives
        self.trans_primitives = trans_primitives
        self.allowed_paths = allowed_paths
        self.max_depth = max_depth
        self.ignore_dataframes = ignore_dataframes
        self.ignore_columns = ignore_columns
        self.seed_features = seed_features
        self.drop_contains = drop_contains
        self.drop_exact = drop_exact
        self.where_primitives = where_primitives
        self.max_features = max_features
        self.verbose = verbose
    def fit(self, X, y=None):
        """Wrapper for DFS

        Calculates a list of features given a dictionary of dataframes
        and a list of relationships. Alternatively, an EntitySet can be
        passed instead of the dataframes and relationships.

        Args:
            X (ft.EntitySet or tuple): EntitySet on which to calculate
                features. If a tuple is passed, it can take one of these
                forms: (entityset, cutoff_time_dataframe),
                (dataframes, relationships), or
                ((dataframes, relationships), cutoff_time_dataframe)

            y (iterable): Training targets

        See Also:
            :func:`synthesis.dfs`
        """
        es, dataframes, relationships, _ = parse_x_input(X)

        self.feature_defs = dfs(entityset=es,
                                dataframes=dataframes,
                                relationships=relationships,
                                target_dataframe_name=self.target_dataframe_name,
                                agg_primitives=self.agg_primitives,
                                trans_primitives=self.trans_primitives,
                                allowed_paths=self.allowed_paths,
                                max_depth=self.max_depth,
                                ignore_dataframes=self.ignore_dataframes,
                                ignore_columns=self.ignore_columns,
                                seed_features=self.seed_features,
                                drop_contains=self.drop_contains,
                                drop_exact=self.drop_exact,
                                where_primitives=self.where_primitives,
                                max_features=self.max_features,
                                features_only=True,
                                verbose=self.verbose)
        return self

    def transform(self, X):
        """Wrapper for calculate_feature_matrix

        Calculates a feature matrix for the given input data and
        calculation times.

        Args:
            X (ft.EntitySet or tuple): EntitySet on which to calculate
                features. If a tuple is passed, it can take one of these
                forms: (entityset, cutoff_time_dataframe),
                (dataframes, relationships), or
                ((dataframes, relationships), cutoff_time_dataframe)

        See Also:
            :func:`computational_backends.calculate_feature_matrix`
        """
        es, dataframes, relationships, cutoff_time = parse_x_input(X)

        X_transformed = calculate_feature_matrix(features=self.feature_defs,
                                                 instance_ids=None,
                                                 cutoff_time=cutoff_time,
                                                 entityset=es,
                                                 dataframes=dataframes,
                                                 relationships=relationships,
                                                 verbose=self.verbose)
        return X_transformed

    def get_params(self, deep=True):
        out = {
            'target_dataframe_name': self.target_dataframe_name,
            'agg_primitives': self.agg_primitives,
            'trans_primitives': self.trans_primitives,
            'allowed_paths': self.allowed_paths,
            'max_depth': self.max_depth,
            'ignore_dataframes': self.ignore_dataframes,
            'ignore_columns': self.ignore_columns,
            'seed_features': self.seed_features,
            'drop_contains': self.drop_contains,
            'drop_exact': self.drop_exact,
            'where_primitives': self.where_primitives,
            'max_features': self.max_features,
            'verbose': self.verbose,
        }
        return out
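
# A minimal sketch of using the transformer standalone, outside a Pipeline.
# This is an illustrative example (not from the original source): it assumes
# the mock-customer demo EntitySet shipped with featuretools, and reuses the
# feature definitions learned by fit() to build the feature matrix.
#
#     import featuretools as ft
#
#     es = ft.demo.load_mock_customer(return_entityset=True, n_customers=3)
#     transformer = DFSTransformer(target_dataframe_name="customers",
#                                  max_features=5)
#     transformer.fit(es)                          # runs dfs(features_only=True)
#     feature_matrix = transformer.transform(es)   # runs calculate_feature_matrix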
def parse_x_input(X):
    if isinstance(X, tuple):
        if isinstance(X[0], tuple):
            # Input of ((dataframes, relationships), cutoff_time)
            dataframes = X[0][0]
            relationships = X[0][1]
            es = None
            cutoff_time = X[1]
        elif isinstance(X[0], dict):
            # Input of (dataframes, relationships)
            dataframes = X[0]
            relationships = X[1]
            es = None
            cutoff_time = None
        else:
            # Input of (entityset, cutoff_time)
            es = X[0]
            dataframes = None
            relationships = None
            cutoff_time = X[1]
    else:
        # Input of entityset
        es = X
        dataframes = None
        relationships = None
        cutoff_time = None

    return es, dataframes, relationships, cutoff_time
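
# The dispatch above accepts four input shapes. An illustrative summary
# (the names es, dataframes, relationships, and cutoff_time here are
# placeholders, not real data):
#
#     parse_x_input(es)                 # -> (es, None, None, None)
#     parse_x_input((es, cutoff_time))  # -> (es, None, None, cutoff_time)
#     parse_x_input((dataframes, relationships))
#         # dataframes is a dict, so -> (None, dataframes, relationships, None)
#     parse_x_input(((dataframes, relationships), cutoff_time))
#         # -> (None, dataframes, relationships, cutoff_time)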