Source code for featuretools.primitives.standard.aggregation.num_consecutive_greater_mean

import numpy as np
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import IntegerNullable

from featuretools.primitives.base import AggregationPrimitive


class NumConsecutiveGreaterMean(AggregationPrimitive):
    """Determines the length of the longest subsequence above the mean.

    Description:
        Given a list of numbers, find the longest subsequence of numbers
        larger than the mean of the entire sequence. Return the length
        of the longest subsequence. Values exactly equal to the mean do
        not count as "above" and therefore end a run.

    Args:
        skipna (bool): If this is False and any value in x is `NaN`, then
            the result will be `NaN`. If True, `NaN` values are skipped.
            Default is True.

    Returns:
        The length of the longest run of consecutive values strictly
        greater than the mean, or `NaN` when every value is null, when
        `skipna` is False and any value is null, or when the mean itself
        is `NaN` (for example when x contains both inf and -inf).

    Examples:
        >>> num_consecutive_greater_mean = NumConsecutiveGreaterMean()
        >>> num_consecutive_greater_mean([1, 2, 3, 4, 5, 6])
        3.0

        We can also control the way `NaN` values are handled.

        >>> num_consecutive_greater_mean = NumConsecutiveGreaterMean(skipna=False)
        >>> num_consecutive_greater_mean([1, 2, 3, 4, 5, 6, None])
        nan
    """

    name = "num_consecutive_greater_mean"
    input_types = [ColumnSchema(semantic_tags={"numeric"})]
    return_type = ColumnSchema(logical_type=IntegerNullable, semantic_tags={"numeric"})
    stack_on_self = False
    default_value = 0

    def __init__(self, skipna=True):
        """Store the `NaN`-handling policy used by the aggregation function."""
        self.skipna = skipna

    def get_function(self):
        """Return the callable that computes the aggregation for a series x."""

        def num_consecutive_greater_mean(x):
            # check for NaN cases
            if x.isnull().all():
                return np.nan
            if not self.skipna and x.isnull().values.any():
                return np.nan
            x_mean = x.mean()

            # In some cases, the mean of x may be NaN
            #   (such as when x has both inf and -inf values)
            if np.isnan(x_mean):
                return np.nan

            # Find positions of points at or below the mean; these are the
            #   points that break a run of above-mean values. Nulls are
            #   dropped first so positions refer to the non-null sequence.
            x = x.dropna().reset_index(drop=True)
            below_mean_indices = x[x <= x_mean].index.to_series()

            # If none of x is below the mean, return the length of x
            if below_mean_indices.empty:
                return len(x)

            # Pad index with start/end sentinel values (-1 and len(x)), in
            #   case the longest sequence occurs at the beginning or end of x
            below_mean_indices[-1] = -1
            below_mean_indices[len(x)] = len(x)
            below_mean_indices = below_mean_indices.sort_index()

            # Calculate gaps between consecutive points at or below the mean
            below_mean_indices_shifted = below_mean_indices.shift(1)
            diffs = below_mean_indices - below_mean_indices_shifted

            # Take biggest gap, and subtract 1 to get the number of
            #   above-mean points lying strictly between two breakers
            max_gap = diffs.max() - 1
            return max_gap

        return num_consecutive_greater_mean
Table of Contents

Quick search

Source code for featuretools.primitives.standard.aggregation.num_consecutive_greater_mean