NOTICE

The upcoming release of Featuretools 1.0.0 contains several breaking changes. Users are encouraged to test this version prior to release by installing from GitHub:

pip install https://github.com/alteryx/featuretools/archive/woodwork-integration.zip

For details on migrating to the new version, refer to Transitioning to Featuretools Version 1.0. Please report any issues in the Featuretools GitHub repo or by messaging in Alteryx Open Source Slack.


Source code for featuretools.entityset.deserialize

import json
import os
import tarfile
import tempfile

import pandas as pd
import woodwork.type_sys.type_system as ww_type_system
from woodwork.deserialize import read_woodwork_table

from featuretools.entityset.relationship import Relationship
from featuretools.utils.gen_utils import check_schema_version
from featuretools.utils.s3_utils import get_transport_params, use_smartopen_es
from featuretools.utils.wrangle import _is_local_tar, _is_s3, _is_url


def description_to_entityset(description, **kwargs):
    '''Deserialize entityset from data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`. Likely generated using :meth:`.serialize.entityset_to_description`
        kwargs (keywords): Additional keyword arguments to pass as keywords arguments to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    from featuretools.entityset import EntitySet

    # If data description was not read from disk, path is None.
    path = description.get('path')
    entityset = EntitySet(description['id'])

    for df in description['dataframes'].values():
        if path is not None:
            data_path = os.path.join(path, 'data', df['name'])
            dataframe = read_woodwork_table(data_path, validate=False, **kwargs)
        else:
            dataframe = empty_dataframe(df)

        entityset.add_dataframe(dataframe)

    for relationship in description['relationships']:
        rel = Relationship.from_dictionary(relationship, entityset)
        entityset.add_relationship(relationship=rel)

    return entityset


def empty_dataframe(description):
    '''Deserialize empty dataframe from dataframe description.

    Args:
        description (dict) : Description of dataframe.

    Returns:
        df (DataFrame) : Empty dataframe with Woodwork initialized.
    '''
    # TODO: Can we update Woodwork to return an empty initialized dataframe from a description
    # instead of using this function? Or otherwise eliminate? Issue #1476
    logical_types = {}
    semantic_tags = {}
    column_descriptions = {}
    column_metadata = {}
    use_standard_tags = {}
    category_dtypes = {}
    columns = []
    for col in description['column_typing_info']:
        col_name = col['name']
        columns.append(col_name)

        ltype_metadata = col['logical_type']
        ltype = ww_type_system.str_to_logical_type(ltype_metadata['type'], params=ltype_metadata['parameters'])

        tags = col['semantic_tags']

        if 'index' in tags:
            tags.remove('index')
        elif 'time_index' in tags:
            tags.remove('time_index')

        logical_types[col_name] = ltype
        semantic_tags[col_name] = tags
        column_descriptions[col_name] = col['description']
        column_metadata[col_name] = col['metadata']
        use_standard_tags[col_name] = col['use_standard_tags']

        if col['physical_type']['type'] == 'category':
            # Make sure categories are recreated properly
            cat_values = col['physical_type']['cat_values']
            cat_dtype = col['physical_type']['cat_dtype']
            cat_object = pd.CategoricalDtype(pd.Index(cat_values, dtype=cat_dtype))
            category_dtypes[col_name] = cat_object
    dataframe = pd.DataFrame(columns=columns).astype(category_dtypes)

    dataframe.ww.init(
        name=description.get('name'),
        index=description.get('index'),
        time_index=description.get('time_index'),
        logical_types=logical_types,
        semantic_tags=semantic_tags,
        use_standard_tags=use_standard_tags,
        table_metadata=description.get('table_metadata'),
        column_metadata=column_metadata,
        column_descriptions=column_descriptions,
        validate=False)
    return dataframe


def read_data_description(path):
    '''Read data description from disk, S3 path, or URL.

        Args:
            path (str): Location on disk, S3 path, or URL to read `data_description.json`.

        Returns:
            description (dict) : Description of :class:`.EntitySet`.
    '''

    path = os.path.abspath(path)
    assert os.path.exists(path), '"{}" does not exist'.format(path)
    filepath = os.path.join(path, 'data_description.json')
    with open(filepath, 'r') as file:
        description = json.load(file)
    description['path'] = path
    return description


[docs]def read_entityset(path, profile_name=None, **kwargs): '''Read entityset from disk, S3 path, or URL. Args: path (str): Directory on disk, S3 path, or URL to read `data_description.json`. profile_name (str, bool): The AWS profile specified to write to S3. Will default to None and search for AWS credentials. Set to False to use an anonymous profile. kwargs (keywords): Additional keyword arguments to pass as keyword arguments to the underlying deserialization method. ''' if _is_url(path) or _is_s3(path) or _is_local_tar(str(path)): with tempfile.TemporaryDirectory() as tmpdir: local_path = path transport_params = None if _is_s3(path): transport_params = get_transport_params(profile_name) if _is_s3(path) or _is_url(path): local_path = os.path.join(tmpdir, "temporary_es") use_smartopen_es(local_path, path, transport_params) with tarfile.open(str(local_path)) as tar: tar.extractall(path=tmpdir) data_description = read_data_description(tmpdir) return description_to_entityset(data_description, **kwargs) else: data_description = read_data_description(path) return description_to_entityset(data_description, **kwargs)