NOTICE
The upcoming release of Featuretools 1.0.0 contains several breaking changes. Users are encouraged to test this version prior to release by installing from GitHub:
pip install https://github.com/alteryx/featuretools/archive/woodwork-integration.zip
For details on migrating to the new version, refer to Transitioning to Featuretools Version 1.0. Please report any issues in the Featuretools GitHub repo or by messaging in Alteryx Open Source Slack.
import json
import os
import tarfile
import tempfile

import pandas as pd
import woodwork.type_sys.type_system as ww_type_system
from woodwork.deserialize import read_woodwork_table

from featuretools.entityset.relationship import Relationship
from featuretools.utils.gen_utils import check_schema_version
from featuretools.utils.s3_utils import get_transport_params, use_smartopen_es
from featuretools.utils.wrangle import _is_local_tar, _is_s3, _is_url


def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated
            using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass as keywords arguments
            to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    # Imported inside the function rather than at module level
    # (presumably to avoid a circular import -- confirm before moving it).
    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            # Data was serialized next to the description: load the real table.
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path, validate=False, **kwargs)
        else:
            # No data on disk: build an empty dataframe carrying only the typing info.
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    # Relationships are added last, once every dataframe they refer to exists.
    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset


def empty_dataframe(description):
    '''Deserialize empty dataframe from dataframe description.

    The passed description is not modified.

    Args:
        description (dict) : Description of dataframe.

    Returns:
        df (DataFrame) : Empty dataframe with Woodwork initialized.
    '''
    # TODO: Can we update Woodwork to return an empty initialized dataframe from a description
    # instead of using this function? Or otherwise eliminate? Issue #1476
    logical_types = {}
    semantic_tags = {}
    column_descriptions = {}
    column_metadata = {}
    use_standard_tags = {}
    category_dtypes = {}
    columns = []

    for col in description['column_typing_info']:
        col_name = col['name']
        columns.append(col_name)

        ltype_metadata = col['logical_type']
        ltype = ww_type_system.str_to_logical_type(ltype_metadata['type'],
                                                   params=ltype_metadata['parameters'])

        # Work on a copy so the caller's description is left untouched.
        tags = list(col['semantic_tags'])
        # The index / time_index columns are handed to ``ww.init`` through their own
        # arguments below, so the corresponding tag is dropped from the explicit tags.
        if 'index' in tags:
            tags.remove('index')
        elif 'time_index' in tags:
            tags.remove('time_index')

        logical_types[col_name] = ltype
        semantic_tags[col_name] = tags
        column_descriptions[col_name] = col['description']
        column_metadata[col_name] = col['metadata']
        use_standard_tags[col_name] = col['use_standard_tags']

        if col['physical_type']['type'] == 'category':
            # Make sure categories are recreated properly
            cat_values = col['physical_type']['cat_values']
            cat_dtype = col['physical_type']['cat_dtype']
            cat_object = pd.CategoricalDtype(pd.Index(cat_values, dtype=cat_dtype))
            category_dtypes[col_name] = cat_object

    dataframe = pd.DataFrame(columns=columns).astype(category_dtypes)
    dataframe.ww.init(
        name=description.get('name'),
        index=description.get('index'),
        time_index=description.get('time_index'),
        logical_types=logical_types,
        semantic_tags=semantic_tags,
        use_standard_tags=use_standard_tags,
        table_metadata=description.get('table_metadata'),
        column_metadata=column_metadata,
        column_descriptions=column_descriptions,
        validate=False)

    return dataframe


def read_data_description(path):
    '''Read data description from a directory on disk.

    Args:
        path (str): Local directory that contains `data_description.json`.

    Returns:
        description (dict) : Description of :class:`.EntitySet`. The absolute
            directory path is stored under the ``'path'`` key.

    Raises:
        AssertionError: If ``path`` does not exist.
    '''
    path = os.path.abspath(path)
    # Raised explicitly (instead of using an ``assert`` statement) so the check is
    # still performed when Python runs with -O; the exception type is unchanged.
    if not os.path.exists(path):
        raise AssertionError('"{}" does not exist'.format(path))

    filepath = os.path.join(path, 'data_description.json')
    with open(filepath, 'r') as file:
        description = json.load(file)

    description['path'] = path
    return description


def _extract_archive(tar, destination):
    '''Extract every member of an open tar archive into ``destination``.'''
    if hasattr(tarfile, 'data_filter'):
        # The archive may have been downloaded from a URL or S3; the 'data' filter
        # rejects members that would be written outside of ``destination``.
        tar.extractall(path=destination, filter='data')
    else:
        # NOTE(review): this interpreter has no extraction filters, so the archive
        # is extracted without path checks. Only read archives from trusted sources.
        tar.extractall(path=destination)


def read_entityset(path, profile_name=None, **kwargs):
    '''Read entityset from disk, S3 path, or URL.

    Args:
        path (str): Directory or tar archive on disk, S3 path, or URL
            to read `data_description.json`.
        profile_name (str, bool): The AWS profile used to read from S3.
            Will default to None and search for AWS credentials.
            Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass as keyword arguments
            to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    if _is_url(path) or _is_s3(path) or _is_local_tar(str(path)):
        # The entityset must be built before the ``with`` block exits, because the
        # extracted files are deleted together with the temporary directory.
        with tempfile.TemporaryDirectory() as tmpdir:
            local_path = path
            transport_params = None

            if _is_s3(path):
                transport_params = get_transport_params(profile_name)

            if _is_s3(path) or _is_url(path):
                # Remote archive: download it into the temporary directory first.
                local_path = os.path.join(tmpdir, "temporary_es")
                use_smartopen_es(local_path, path, transport_params)

            with tarfile.open(str(local_path)) as tar:
                _extract_archive(tar, tmpdir)

            data_description = read_data_description(tmpdir)
            return description_to_entityset(data_description, **kwargs)
    else:
        data_description = read_data_description(path)
        return description_to_entityset(data_description, **kwargs)