import json
import featuretools as ft
def describe_feature(
    feature,
    feature_descriptions=None,
    primitive_templates=None,
    metadata_file=None,
):
    """Generates an English language description of a feature.

    Custom descriptions and templates may be passed directly and/or loaded
    from a json metadata file. When both are given, entries passed directly
    take precedence over entries with the same key from the file.

    Args:
        feature (FeatureBase) : Feature to describe
        feature_descriptions (dict, optional) : dictionary mapping features or unique
            feature names to custom descriptions
        primitive_templates (dict, optional) : dictionary mapping primitives or
            primitive names to description templates
        metadata_file (str, optional) : path to json metadata file

    Returns:
        str : English description of the feature, with its first character
            upper-cased and a trailing period appended
    """
    feature_descriptions = feature_descriptions or {}
    primitive_templates = primitive_templates or {}
    if metadata_file:
        file_feature_descriptions, file_primitive_templates = parse_json_metadata(
            metadata_file,
        )
        # The explicit arguments are unpacked last so that they override any
        # entry with the same key loaded from the metadata file.
        feature_descriptions = {**file_feature_descriptions, **feature_descriptions}
        primitive_templates = {**file_primitive_templates, **primitive_templates}

    description = generate_description(
        feature,
        feature_descriptions,
        primitive_templates,
    )
    # Slicing (rather than indexing) keeps this safe for an empty description.
    return description[:1].upper() + description[1:] + "."
def generate_description(feature, feature_descriptions, primitive_templates):
    """Recursively build the description phrase for ``feature``.

    The phrase is returned as-is; capitalization and the trailing period are
    added by ``describe_feature``.

    Args:
        feature : feature to describe
        feature_descriptions (dict) : custom descriptions, looked up by the
            feature itself and by its unique name
        primitive_templates (dict) : description templates, looked up by the
            feature's primitive and by the primitive's name

    Returns:
        The custom description if one is registered for the feature;
        otherwise a phrase assembled from the primitive description and,
        where applicable, the aggregation dataframe, "where" and "for each"
        clauses.
    """
    # A custom description short-circuits everything else.
    if feature in feature_descriptions or feature.unique_name() in feature_descriptions:
        return feature_descriptions.get(feature) or feature_descriptions.get(
            feature.unique_name(),
        )

    # Identity features: use the column's own description, falling back to
    # the quoted column name.
    if isinstance(feature, ft.IdentityFeature):
        column_text = feature.column_schema.description
        if column_text is not None:
            return column_text
        return 'the "{}"'.format(feature.column_name)

    # Direct features: describe the underlying feature, then append the
    # relationship phrase.
    if isinstance(feature, ft.DirectFeature):
        root_feature, relationship_text = get_direct_description(feature)
        root_text = generate_description(
            root_feature,
            feature_descriptions,
            primitive_templates,
        )
        return root_text + relationship_text

    # For an output slice, the inputs are those of the sliced feature.
    is_output_slice = isinstance(feature, ft.feature_base.FeatureOutputSlice)
    if is_output_slice:
        source_features = feature.base_feature.base_features
    else:
        source_features = feature.base_features
    input_descriptions = [
        generate_description(source, feature_descriptions, primitive_templates)
        for source in source_features
    ]

    # For groupby transforms, the last input description is pulled out of the
    # primitive's inputs and used for the "for each" phrase instead.
    groupby_description = None
    if isinstance(feature, ft.GroupByTransformFeature):
        groupby_description = input_descriptions.pop()

    # Primitive phrase, honouring a template registered under either the
    # primitive or its name.
    primitive = feature.primitive
    template_override = None
    if primitive in primitive_templates or primitive.name in primitive_templates:
        template_override = primitive_templates.get(
            primitive,
        ) or primitive_templates.get(primitive.name)
    primitive_description = primitive.get_description(
        input_descriptions,
        slice_num=getattr(feature, "n", None),
        template_override=template_override,
    )

    # From here on, an output slice is treated as the feature it slices.
    if is_output_slice:
        feature = feature.base_feature
    is_aggregation = isinstance(feature, ft.AggregationFeature)

    # "for each ..." phrase
    if is_aggregation:
        groupby_description = get_aggregation_groupby(feature, feature_descriptions)
    groupby_phrase = ""
    if groupby_description is not None:
        if groupby_description.startswith("the "):
            groupby_description = groupby_description[4:]
        groupby_phrase = "for each {}".format(groupby_description)

    # "of all instances of ..." / "of the previous ... of ..." phrase
    dataframe_phrase = ""
    if is_aggregation:
        if feature.use_previous:
            dataframe_phrase = "of the previous {} of ".format(
                feature.use_previous.get_name().lower(),
            )
        else:
            dataframe_phrase = "of all instances of "
        dataframe_phrase += '"{}"'.format(
            feature.relationship_path[-1][1].child_dataframe.ww.name,
        )

    # "where ... is ..." phrase
    where_phrase = ""
    where_feature = getattr(feature, "where", None)
    if where_feature:
        where_subject = generate_description(
            where_feature.base_features[0],
            feature_descriptions,
            primitive_templates,
        )
        where_phrase = "where {} is {}".format(
            where_subject,
            where_feature.primitive.value,
        )

    # Join the non-empty phrases in a fixed order.
    phrases = (primitive_description, dataframe_phrase, where_phrase, groupby_phrase)
    return " ".join(phrase for phrase in phrases if phrase != "")
def get_direct_description(feature):
    """Build the relationship phrase for a (possibly stacked) direct feature.

    Args:
        feature : direct feature whose relationship should be described

    Returns:
        tuple : the first base feature that is not itself a direct feature,
            and the phrase (starting with " for") to append to that base
            feature's description
    """
    parent_name = feature.relationship_path[-1][1].parent_dataframe.ww.name
    phrase = ' the instance of "{}" associated with this instance of "{}"'.format(
        parent_name,
        feature.dataframe_name,
    )

    # Collapse chains of direct features into a single phrase so the result
    # is easier to read; each hop is prepended in front of what we have so far.
    inputs = feature.base_features
    while isinstance(inputs[0], ft.DirectFeature):
        hop = inputs[0]
        hop_parent = hop.relationship_path[-1][1].parent_dataframe.ww.name
        phrase = ' the instance of "{}" associated with'.format(hop_parent) + phrase
        inputs = hop.base_features

    return inputs[0], " for" + phrase
def get_aggregation_groupby(feature, feature_descriptions=None):
    """Describe the index column that an aggregation feature is grouped by.

    Args:
        feature : aggregation feature; its dataframe's index is the groupby
        feature_descriptions (dict, optional) : custom descriptions, looked
            up by the index identity feature and by its unique name

    Returns:
        The custom description of the index feature if one is registered,
        otherwise a string of the form ``"<index>" in "<dataframe>"``.
    """
    descriptions = {} if feature_descriptions is None else feature_descriptions

    index_name = feature.dataframe.ww.index
    index_feature = ft.IdentityFeature(
        feature.entityset[feature.dataframe_name].ww[index_name],
    )

    has_custom = (
        index_feature in descriptions
        or index_feature.unique_name() in descriptions
    )
    if not has_custom:
        return '"{}" in "{}"'.format(index_name, feature.dataframe_name)

    return descriptions.get(index_feature) or descriptions.get(
        index_feature.unique_name(),
    )
def parse_json_metadata(file):
    """Load custom feature descriptions and primitive templates from json.

    Args:
        file (str) : path to the json metadata file

    Returns:
        tuple : ``(feature_descriptions, primitive_templates)`` taken from the
            top-level keys of the same names; each defaults to an empty dict
            when its key is absent
    """
    # JSON is UTF-8 by specification; open explicitly as UTF-8 so the file is
    # not decoded with the platform's default encoding.
    with open(file, encoding="utf-8") as f:
        json_metadata = json.load(f)

    return (
        json_metadata.get("feature_descriptions", {}),
        json_metadata.get("primitive_templates", {}),
    )