from woodwork.logical_types import Boolean, BooleanNullable
def remove_highly_null_features(feature_matrix, features=None, pct_null_threshold=0.95):
    """Remove columns of a feature matrix whose share of null values is too high.

    Args:
        feature_matrix (:class:`pd.DataFrame`): DataFrame whose columns are feature names and rows are instances.
        features (list[:class:`featuretools.FeatureBase`] or list[str], optional): List of features to select.
        pct_null_threshold (float): Fraction of null values, between 0 and 1 inclusive.
            A feature whose fraction of null values is greater than or equal to this
            amount is considered highly-null and removed. With a threshold of 0, only
            features containing at least one null value are removed. Defaults to 0.95.

    Returns:
        pd.DataFrame, list[:class:`.FeatureBase`]:
            The feature matrix and the list of generated feature definitions. Matches dfs output.
            If no feature list is provided as input, the feature list will not be returned.

    Raises:
        ValueError: If ``pct_null_threshold`` is NaN or lies outside the range [0, 1].
    """
    # Negated chained comparison on purpose: NaN fails every comparison, so
    # "x < 0 or x > 1" would let it through and every column would then be dropped.
    if not 0 <= pct_null_threshold <= 1:
        raise ValueError("pct_null_threshold must be a float between 0 and 1, inclusive.")

    percent_null_by_col = feature_matrix.isnull().mean().to_dict()

    if pct_null_threshold == 0.0:
        # A strict "<" can never hold against 0, so columns without any nulls
        # are kept with an inclusive comparison instead.
        keep = [f_name for f_name, pct_null in percent_null_by_col.items()
                if pct_null <= pct_null_threshold]
    else:
        keep = [f_name for f_name, pct_null in percent_null_by_col.items()
                if pct_null < pct_null_threshold]

    return _apply_feature_selection(keep, feature_matrix, features)
def remove_single_value_features(feature_matrix, features=None, count_nan_as_value=False):
    """Drop every column of the feature matrix that holds at most one distinct value.

    Args:
        feature_matrix (:class:`pd.DataFrame`): DataFrame whose columns are feature names and rows are instances.
        features (list[:class:`featuretools.FeatureBase`] or list[str], optional): List of features to select.
        count_nan_as_value (bool): When True, a missing value counts as a distinct value
            of its own. When False, a feature with one unique value and all other data
            missing is removed from the feature matrix. Defaults to False.

    Returns:
        pd.DataFrame, list[:class:`.FeatureBase`]:
            The feature matrix and the list of generated feature definitions.
            Matches dfs output.
            If no feature list is provided as input, the feature list will not be returned.
    """
    # Missing values are excluded from the count unless the caller asked to count them.
    ignore_missing = not count_nan_as_value
    distinct_per_column = feature_matrix.nunique(dropna=ignore_missing).to_dict()

    keep = []
    for column_name, n_distinct in distinct_per_column.items():
        if n_distinct > 1:
            keep.append(column_name)

    return _apply_feature_selection(keep, feature_matrix, features)
def _apply_feature_selection(keep, feature_matrix, features=None):
    """Restrict a feature matrix, and optionally a feature list, to the kept columns.

    Args:
        keep (list[str]): Names of the columns of ``feature_matrix`` to retain.
        feature_matrix (pd.DataFrame): DataFrame whose columns are feature names.
        features (list, optional): Feature definitions or plain feature-name strings
            that correspond to the columns of ``feature_matrix``.

    Returns:
        pd.DataFrame: ``feature_matrix[keep]`` when ``features`` is None.
        tuple: ``(feature_matrix[keep], filtered_features)`` otherwise. A string entry
            is kept when it names a retained column. A feature object with a single
            output is kept when its ``get_name()`` is a retained column. A feature
            object with several outputs is kept whole when all of its output columns
            are retained; otherwise it is replaced by the individual output slices
            ``f[i]`` whose columns are retained.
    """
    new_matrix = feature_matrix[keep]
    if features is None:
        return new_matrix

    new_feature_names = set(new_matrix.columns)
    new_features = []
    for f in features:
        if isinstance(f, str):
            # The public selection functions document list[str] as valid input;
            # a bare name has no feature attributes, so filter it by membership.
            if f in new_feature_names:
                new_features.append(f)
        elif f.number_output_features > 1:
            # Build each output slice once and keep those whose column was retained.
            outputs = (f[i] for i in range(f.number_output_features))
            slices = [piece for piece in outputs
                      if piece.get_name() in new_feature_names]
            if len(slices) == f.number_output_features:
                # Every output column was retained, so keep the original feature intact.
                new_features.append(f)
            else:
                new_features.extend(slices)
        elif f.get_name() in new_feature_names:
            new_features.append(f)

    return new_matrix, new_features