# Source code for featuretools.utils.recommend_primitives

import logging
from typing import List

from featuretools.computational_backends import calculate_feature_matrix
from featuretools.entityset import EntitySet
from featuretools.primitives.utils import get_transform_primitives
from featuretools.synthesis import dfs, get_valid_primitives

# Non-numeric primitives that require specific ordering or a time index to be set.
# These names are part of DEFAULT_EXCLUDED_PRIMITIVES.
ORDERED_PRIMITIVES = [
    "cum_count",
    "cumulative_time_since_last_false",
    "cumulative_time_since_last_true",
    "diff",
    "diff_datetime",
    "is_first_occurrence",
    "is_last_occurrence",
    "time_since_previous",
]


# Primitives that are deprecated or duplicate another primitive (reason noted
# per entry). These names are part of DEFAULT_EXCLUDED_PRIMITIVES.
DEPRECATED_PRIMITIVES = [
    "multiply_boolean",  # functionality duplicated by 'and' primitive
    "numeric_lag",  # deprecated and replaced with `lag`
]

# Non-numeric primitives that require input.
# NOTE(review): "input" presumably means a user-supplied argument (e.g. the scalar
# or value list to compare against) -- confirm against the primitive definitions.
# These names are part of DEFAULT_EXCLUDED_PRIMITIVES.
REQUIRED_INPUT_PRIMITIVES = [
    "count_string",
    "distance_to_holiday",
    "is_in_geobox",
    "not_equal_scalar",
    "equal_scalar",
    "time_since",
    "isin",
]

# Primitives excluded because they can produce too many features or aren't
# useful in extracting information.
# These names are part of DEFAULT_EXCLUDED_PRIMITIVES.
OTHER_PRIMITIVES_TO_EXCLUDE = [
    "not",
    "and",
    "or",
    "equal",
    "not_equal",
]

# Single list built by concatenating, in this order, REQUIRED_INPUT_PRIMITIVES,
# DEPRECATED_PRIMITIVES, ORDERED_PRIMITIVES and OTHER_PRIMITIVES_TO_EXCLUDE.
# NOTE(review): presumably the primitives left out of recommendations by default;
# the code that consumes this list is not visible in this section -- confirm.
DEFAULT_EXCLUDED_PRIMITIVES = (
    REQUIRED_INPUT_PRIMITIVES
    + DEPRECATED_PRIMITIVES
    + ORDERED_PRIMITIVES
    + OTHER_PRIMITIVES_TO_EXCLUDE
)

# Hard-coded names of the expanding-window, lag and rolling-window primitives.
# NOTE(review): presumably the primitives offered for time series problems; no
# code in this section references the list -- confirm against its caller.
# TODO: Make this list more dynamic
TIME_SERIES_PRIMITIVES = [
    "expanding_count",
    "expanding_max",
    "expanding_mean",
    "expanding_min",
    "expanding_std",
    "expanding_trend",
    "lag",
    "rolling_count",
    "rolling_outlier_count",
    "rolling_max",
    "rolling_mean",
    "rolling_min",
    "rolling_std",
    "rolling_trend",
]


# TODO: Support multi-table



def _recommend_non_numeric_primitives(
    entityset: EntitySet,
    target_dataframe_name: str,
    valid_primitives: List[str],
) -> set:
    """Select the primitives whose features on non-numeric columns are not constant.

    Description:
        Feature definitions are generated with `dfs` at depth 1 from `valid_primitives`,
        with every numeric column of the target dataframe ignored. Each feature is then
        calculated on its own, and its primitive is recommended as soon as one of the
        feature's output columns holds more than one distinct value.

    Args:
        entityset (EntitySet): EntitySet that only contains one dataframe.
        target_dataframe_name (str): Name of target dataframe to access in `entityset`.
        valid_primitives (List[str]): List of primitives to calculate and check output features.

    Returns:
        set: Names of the recommended primitives.

    Note:
        A feature that raises while being calculated is not recommended; the error is
        logged to the "featuretools" logger and processing continues with the next feature.
    """
    # Numeric columns are left out so that feature generation only runs on non-numeric inputs.
    numeric_schema = entityset[target_dataframe_name].ww.select(
        include="numeric",
        return_schema=True,
    )
    columns_to_skip = {target_dataframe_name: list(numeric_schema.columns)}

    candidate_features = dfs(
        entityset=entityset,
        target_dataframe_name=target_dataframe_name,
        trans_primitives=valid_primitives,
        max_depth=1,
        features_only=True,
        ignore_columns=columns_to_skip,
    )

    selected: set[str] = set()
    for feature in candidate_features:
        primitive_name = feature.primitive.name
        # Skip unnamed primitives and primitives that have already been recommended.
        if primitive_name is None or primitive_name in selected:
            continue

        try:
            feature_matrix = calculate_feature_matrix([feature], entityset)
            for column_name in feature.get_feature_names():
                distinct_values = feature_matrix[column_name].unique()
                if len(distinct_values) > 1:
                    selected.add(primitive_name)
        except Exception as err:
            # Deliberately best effort: a failing feature only loses its recommendation.
            logging.getLogger("featuretools").error(
                f"Exception with feature {feature.get_name()} with primitive {primitive_name}: {str(err)}",
            )

    return selected


def _recommend_skew_numeric_primitives(
    entityset: EntitySet,
    target_dataframe_name: str,
    valid_primitives: List[str],
) -> set:
    """Get a set of recommended skew numeric primitives given an entity set.

    Description:
        Looks at every numeric column of the target dataframe and, based on the column's
        skewness, recommends the primitives (restricted to those present in
        `valid_primitives`) which could be applied to address right skewness:

        * ``0.5 < skew <= 1``: "square_root"
        * ``skew > 1``: "natural_logarithm"

        Columns containing missing values, zeros or negative numbers are skipped, since
        neither transform should be recommended for them.

    Args:
        entityset (EntitySet): EntitySet that only contains one dataframe.
        target_dataframe_name (str): Name of target dataframe to access in `entityset`.
        valid_primitives (List[str]): List of primitives to compare.

    Returns:
        set: Names of the recommended skew primitives; empty when neither skew primitive
            is in `valid_primitives` or when no column qualifies.

    Note:
        We currently only have primitives to address right skewness.
    """
    recommended_skew_primitives: set[str] = set()
    valid_skew_primitives = {"square_root", "natural_logarithm"}.intersection(
        valid_primitives,
    )
    if not valid_skew_primitives:
        return recommended_skew_primitives

    numerics_only_df = entityset[target_dataframe_name].ww.select("numeric")
    for col in numerics_only_df:
        column = numerics_only_df[col]
        # Shouldn't recommend log, sqrt if nans, zeros and negative numbers are present.
        # The explicit null check is kept alongside the positivity check so that missing
        # values always disqualify a column.
        if column.isnull().any() or not (column > 0).all():
            continue

        skew = column.skew()
        # We currently don't have anything in featuretools to automatically handle left
        # skewed data as well as skewed data with negative values.
        # The upper bound is inclusive so that a skew of exactly 1 is not left without
        # any recommendation between the two bands.
        if 0.5 < skew <= 1 and "square_root" in valid_skew_primitives:
            recommended_skew_primitives.add("square_root")
            # TODO: Add Box Cox here when available
        if skew > 1 and "natural_logarithm" in valid_skew_primitives:
            recommended_skew_primitives.add("natural_logarithm")
            # TODO: Add log base 10 transform primitive when available
    return recommended_skew_primitives