import importlib.util
import os
from inspect import getfullargspec, getsource, isclass
from typing import Dict, List

import pandas as pd
from woodwork import list_logical_types, list_semantic_tags, type_system
from woodwork.column_schema import ColumnSchema
from woodwork.logical_types import NaturalLanguage

import featuretools
from featuretools.primitives import NumberOfCommonWords
from featuretools.primitives.base import (
    AggregationPrimitive,
    PrimitiveBase,
    TransformPrimitive,
)
from featuretools.utils.gen_utils import find_descendents


def _named_primitive_classes(primitive_kind):
    """Collect the classes exposed by ``featuretools.primitives`` that
    subclass `primitive_kind` and define a truthy ``name`` attribute.

    Shared by `_get_primitives` and `get_all_primitives`, which differ only
    in how they key the resulting classes.
    """
    found = set()
    for attribute_string in dir(featuretools.primitives):
        attribute = getattr(featuretools.primitives, attribute_string)
        # isclass() must come first: issubclass() raises TypeError on
        # non-class attributes (functions, modules, ...).
        if (
            isclass(attribute)
            and issubclass(attribute, primitive_kind)
            and attribute.name
        ):
            found.add(attribute)
    return found


def _get_primitives(primitive_kind):
    """Helper function that selects all primitives
    that are subclasses of `primitive_kind`.

    Returns:
        dict: Maps each primitive's lower-cased ``name`` to its class.
    """
    return {
        prim.name.lower(): prim for prim in _named_primitive_classes(primitive_kind)
    }


def get_aggregation_primitives():
    """Returns all aggregation primitives, keyed by lower-cased primitive name"""
    return _get_primitives(featuretools.primitives.AggregationPrimitive)


def get_transform_primitives():
    """Returns all transform primitives, keyed by lower-cased primitive name"""
    return _get_primitives(featuretools.primitives.TransformPrimitive)


def get_all_primitives():
    """Helper function to return all primitives.

    Unlike `_get_primitives`, the returned dict is keyed by the class
    ``__name__`` rather than by the lower-cased primitive ``name``.
    """
    return {prim.__name__: prim for prim in _named_primitive_classes(PrimitiveBase)}


def _get_natural_language_primitives():
    """Returns all Natural Language transform primitives.

    A transform primitive is included when any of its ``input_types``
    entries has a ``NaturalLanguage`` logical type.
    """
    transform_primitives = get_transform_primitives()

    def _natural_language_in_input_type(primitive):
        for input_type in primitive.input_types:
            # An entry is either a single column schema or a list of them;
            # normalise to a list so both shapes share one check.
            schemas = input_type if isinstance(input_type, list) else [input_type]
            if any(
                isinstance(column_schema.logical_type, NaturalLanguage)
                for column_schema in schemas
            ):
                return True
        return False

    return {
        name: primitive
        for name, primitive in transform_primitives.items()
        if _natural_language_in_input_type(primitive)
    }
def list_primitives():
    """Returns a DataFrame that lists and describes each built-in primitive.

    Returns:
        pd.DataFrame: One row per primitive with the columns ``name``,
            ``type`` ("aggregation" or "transform"), ``description``,
            ``valid_inputs`` and ``return_type``. Aggregation primitives
            come first, followed by transform primitives.
    """
    columns = [
        "name",
        "type",
        "description",
        "valid_inputs",
        "return_type",
    ]

    def _describe(primitive_func, primitive_type):
        # Build the per-kind frame; both kinds share exactly the same layout.
        names, primitives, valid_inputs, return_type = _get_names_primitives(
            primitive_func,
        )
        frame = pd.DataFrame(
            {
                "name": names,
                "description": _get_descriptions(primitives),
                "valid_inputs": valid_inputs,
                "return_type": return_type,
            },
        )
        frame["type"] = primitive_type
        return frame

    transform_df = _describe(get_transform_primitives, "transform")
    agg_df = _describe(get_aggregation_primitives, "aggregation")
    return pd.concat([agg_df, transform_df], ignore_index=True)[columns]
def summarize_primitives() -> pd.DataFrame:
    """Returns a metrics summary DataFrame of all primitives found in list_primitives.

    Returns:
        pd.DataFrame: One row per metric with the columns ``Metric`` and
            ``Count``. Rows appear in this order: the three totals, the
            general metrics, one ``uses_<logical type>_input`` row per
            logical type, then one ``uses_<tag>_tag_input`` row per
            semantic tag.
    """
    # Only the names (for counting) and the classes (for the metrics) are
    # needed here; the valid-input and return-type lists are discarded.
    trans_names, trans_primitives, _, _ = _get_names_primitives(
        get_transform_primitives,
    )
    agg_names, agg_primitives, _, _ = _get_names_primitives(
        get_aggregation_primitives,
    )

    tot_trans = len(trans_names)
    tot_agg = len(agg_names)
    primitives_summary = _get_summary_primitives(trans_primitives + agg_primitives)

    summary_dict = {
        "total_primitives": tot_trans + tot_agg,
        "aggregation_primitives": tot_agg,
        "transform_primitives": tot_trans,
        **primitives_summary["general_metrics"],
    }
    summary_dict.update(
        {
            f"uses_{ltype}_input": count
            for ltype, count in primitives_summary["logical_type_input_metrics"].items()
        },
    )
    summary_dict.update(
        {
            f"uses_{tag}_tag_input": count
            for tag, count in primitives_summary["semantic_tag_metrics"].items()
        },
    )
    return pd.DataFrame(
        [{"Metric": k, "Count": v} for k, v in summary_dict.items()],
    )
def get_default_aggregation_primitives():
    """Returns the list of aggregation primitive classes used as defaults."""
    return [
        featuretools.primitives.Sum,
        featuretools.primitives.Std,
        featuretools.primitives.Max,
        featuretools.primitives.Skew,
        featuretools.primitives.Min,
        featuretools.primitives.Mean,
        featuretools.primitives.Count,
        featuretools.primitives.PercentTrue,
        featuretools.primitives.NumUnique,
        featuretools.primitives.Mode,
    ]


def get_default_transform_primitives():
    """Returns the list of transform primitive classes used as defaults."""
    # featuretools.primitives.TimeSince is currently not part of this list
    return [
        featuretools.primitives.Age,
        featuretools.primitives.Day,
        featuretools.primitives.Year,
        featuretools.primitives.Month,
        featuretools.primitives.Weekday,
        featuretools.primitives.Haversine,
        featuretools.primitives.NumWords,
        featuretools.primitives.NumCharacters,
    ]


def _get_descriptions(primitives):
    """Returns a one-line description for each primitive.

    The description is the first paragraph of the primitive's docstring
    with all whitespace runs collapsed to single spaces, or an empty string
    when the primitive has no docstring.
    """
    descriptions = []
    for prim in primitives:
        description = ""
        if prim.__doc__ is not None:
            # Break on the empty line between the docstring description and
            # the remainder of the docstring
            description = prim.__doc__.split("\n\n")[0]
            # remove any excess whitespace from line breaks
            description = " ".join(description.split())
        descriptions.append(description)
    return descriptions


def _get_summary_primitives(primitives: List) -> Dict[str, Dict[str, int]]:
    """Provides metrics for a list of primitives.

    Returns:
        dict: Three nested dicts of counts under the keys
            ``general_metrics``, ``logical_type_input_metrics`` and
            ``semantic_tag_metrics``.
    """
    unique_input_types = set()
    unique_output_types = set()
    uses_multi_input = 0
    uses_multi_output = 0
    uses_external_data = 0
    are_controllable = 0
    logical_type_metrics = {
        log_type: 0 for log_type in list_logical_types()["type_string"]
    }
    semantic_tag_metrics = {sem_tag: 0 for sem_tag in list_semantic_tags()["name"]}
    # not currently in list_semantic_tags()
    semantic_tag_metrics.update({"foreign_key": 0})

    for prim in primitives:
        # Per-primitive sets, so each primitive bumps a given logical type
        # or tag counter at most once no matter how often it uses it.
        log_in_type_checks = set()
        sem_tag_type_checks = set()
        input_types = prim.flatten_nested_input_types(prim.input_types)
        _check_input_types(
            input_types,
            log_in_type_checks,
            sem_tag_type_checks,
            unique_input_types,
        )
        for ltype in log_in_type_checks:
            logical_type_metrics[ltype] += 1

        for sem_tag in sem_tag_type_checks:
            semantic_tag_metrics[sem_tag] += 1

        if len(prim.input_types) > 1:
            uses_multi_input += 1

        # checks if number_output_features is set as an instance variable or
        # set as a constant
        if (
            "self.number_output_features =" in getsource(prim.__init__)
            or prim.number_output_features > 1
        ):
            uses_multi_output += 1
        unique_output_types.add(str(prim.return_type))

        if hasattr(prim, "filename"):
            uses_external_data += 1

        # More than just `self` in __init__ means the primitive takes arguments.
        if len(getfullargspec(prim.__init__).args) > 1:
            are_controllable += 1

    return {
        "general_metrics": {
            "unique_input_types": len(unique_input_types),
            "unique_output_types": len(unique_output_types),
            "uses_multi_input": uses_multi_input,
            "uses_multi_output": uses_multi_output,
            "uses_external_data": uses_external_data,
            "are_controllable": are_controllable,
        },
        "logical_type_input_metrics": logical_type_metrics,
        "semantic_tag_metrics": semantic_tag_metrics,
    }


def _check_input_types(
    input_types: List["ColumnSchema"],
    log_in_type_checks: set,
    sem_tag_type_checks: set,
    unique_input_types: set,
):
    """Checks if any logical types or semantic tags occur in a list of Woodwork input types and keeps track of unique input types.

    Nothing is returned; the three set arguments are updated in place.
    """
    for in_type in input_types:
        if in_type.semantic_tags:
            sem_tag_type_checks.update(in_type.semantic_tags)
        if in_type.logical_type:
            log_in_type_checks.add(in_type.logical_type.type_string)
        unique_input_types.add(str(in_type))


def _get_names_primitives(primitive_func):
    """Returns parallel lists describing the primitives from `primitive_func`.

    Args:
        primitive_func (callable): Zero-argument callable returning a dict
            that maps primitive names to primitive classes.

    Returns:
        tuple: ``(names, primitives, valid_inputs, return_type)``.
            ``valid_inputs`` holds, per primitive, its unique input types
            joined with ", "; ``return_type`` holds ``str(return_type)``,
            or None when the primitive has no return type.
    """
    names = []
    primitives = []
    valid_inputs = []
    return_type = []
    for name, primitive in primitive_func().items():
        names.append(name)
        primitives.append(primitive)
        # The unique types come back as a set; sort them so the joined
        # string does not depend on set iteration order.
        input_types = _get_unique_input_types(primitive.input_types)
        valid_inputs.append(", ".join(sorted(input_types)))
        prim_return_type = primitive.return_type
        return_type.append(
            str(prim_return_type) if prim_return_type is not None else None,
        )
    return names, primitives, valid_inputs, return_type


def _get_unique_input_types(input_types):
    """Returns the set of ``str()`` forms of all input types, flattening nested lists."""
    types = set()
    for input_type in input_types:
        if isinstance(input_type, list):
            types |= _get_unique_input_types(input_type)
        else:
            types.add(str(input_type))
    return types


def list_primitive_files(directory):
    """returns list of files in directory that might contain primitives"""
    # Validate the joined path rather than the bare entry name, so the
    # directory test inside check_valid_primitive_path looks at the entry
    # within `directory` instead of a same-named path in the CWD.
    candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
    return [path for path in candidates if check_valid_primitive_path(path)]


def check_valid_primitive_path(path):
    """Returns True when `path` could be a primitive file.

    A path qualifies when it is not a directory and its base name ends in
    ".py" without starting with "__" or ".". An empty base name is rejected.
    """
    if os.path.isdir(path):
        return False

    filename = os.path.basename(path)
    return filename.endswith(".py") and not filename.startswith(("__", "."))


def load_primitive_from_file(filepath):
    """load primitive objects in a file

    Returns:
        tuple: ``(name, class)`` of the single primitive found in the file.

    Raises:
        RuntimeError: If the file defines no primitive, or more than one.
    """
    # The first argument of spec_from_file_location is the name given to
    # the loaded module; use the file name without its ".py" suffix.
    module_name = os.path.basename(filepath)[:-3]
    spec = importlib.util.spec_from_file_location(module_name, filepath)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    primitives = []
    for primitive_name in vars(module):
        primitive_class = getattr(module, primitive_name)
        if (
            isclass(primitive_class)
            and issubclass(primitive_class, PrimitiveBase)
            and primitive_class not in (AggregationPrimitive, TransformPrimitive)
        ):
            primitives.append((primitive_name, primitive_class))

    if len(primitives) == 0:
        raise RuntimeError("No primitive defined in file %s" % filepath)
    elif len(primitives) > 1:
        raise RuntimeError("More than one primitive defined in file %s" % filepath)

    return primitives[0]


def serialize_primitive(primitive: "PrimitiveBase"):
    """build a dictionary with the data necessary to construct the given primitive"""
    args_dict = {name: val for name, val in primitive.get_arguments()}
    cls = type(primitive)
    if cls is NumberOfCommonWords and "word_set" in args_dict:
        # Convert word_set to a list so the result is serializable;
        # deserialize_primitive converts it back to a set.
        args_dict["word_set"] = list(args_dict["word_set"])
    return {
        "type": cls.__name__,
        "module": cls.__module__,
        "arguments": args_dict,
    }


class PrimitivesDeserializer(object):
    """
    This class wraps a cache and a generator which iterates over all primitive
    classes. When deserializing a primitive if it is not in the cache then we
    iterate until it is found, adding every seen class to the cache. When
    deserializing the next primitive the iteration resumes where it left off. This
    means that we never visit a class more than once.
    """

    def __init__(self):
        # Cache to avoid repeatedly searching for primitive class
        # (class_name, module_name) -> class
        self.class_cache = {}
        self.primitive_classes = find_descendents(PrimitiveBase)

    def deserialize_primitive(self, primitive_dict):
        """
        Construct a primitive from the given dictionary (output from
        serialize_primitive).

        Raises:
            RuntimeError: If no matching primitive class can be found.
        """
        class_name = primitive_dict["type"]
        module_name = primitive_dict["module"]
        # Only the top-level package of the module path is part of the key.
        class_cache_key = (class_name, module_name.split(".")[0])

        if class_cache_key in self.class_cache:
            cls = self.class_cache[class_cache_key]
        else:
            cls = self._find_class_in_descendants(class_cache_key)

            if cls is None:
                raise RuntimeError(
                    'Primitive "%s" in module "%s" not found'
                    % (class_name, module_name),
                )

        # Work on a shallow copy so the caller's dictionary is left untouched
        # by the word_set conversion below.
        arguments = dict(primitive_dict["arguments"])
        if cls is NumberOfCommonWords and "word_set" in arguments:
            # We converted word_set from a set to a list to make it serializable,
            # we should convert it back now.
            arguments["word_set"] = set(arguments["word_set"])
        return cls(**arguments)

    def _find_class_in_descendants(self, search_key):
        """Advance through the remaining primitive classes, caching each one,
        until `search_key` is found. Returns None if the classes run out first.
        """
        for cls in self.primitive_classes:
            cls_key = (cls.__name__, cls.__module__.split(".")[0])
            self.class_cache[cls_key] = cls

            if cls_key == search_key:
                return cls
        return None


def get_all_logical_type_names():
    """Helper function that returns all registered woodwork logical types"""
    return {lt.__name__: lt for lt in type_system.registered_types}