import json
import os
import tarfile
import tempfile
import pandas as pd
import woodwork.type_sys.type_system as ww_type_system
from woodwork.deserialize import read_woodwork_table
from featuretools.entityset.relationship import Relationship
from featuretools.utils.gen_utils import check_schema_version
from featuretools.utils.s3_utils import get_transport_params, use_smartopen_es
from featuretools.utils.wrangle import _is_local_tar, _is_s3, _is_url
def description_to_entityset(description, **kwargs):
    '''Rebuild an :class:`.EntitySet` from its data description.

    Args:
        description (dict) : Description of an :class:`.EntitySet`, typically
            produced by :meth:`.serialize.entityset_to_description`.
        kwargs (keywords): Extra keyword arguments forwarded to
            ``read_woodwork_table`` when dataframes are loaded from disk.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    check_schema_version(description, 'entityset')

    # Imported inside the function, after the schema check has passed.
    from featuretools.entityset import EntitySet

    # The 'path' key is absent (None) when the description was not read from disk.
    root = description.get('path')
    es = EntitySet(description['id'])

    for df_description in description['dataframes'].values():
        if root is None:
            # No data on disk: build an empty, Woodwork-initialized dataframe.
            table = empty_dataframe(df_description)
        else:
            table = read_woodwork_table(
                os.path.join(root, 'data', df_description['name']),
                validate=False,
                **kwargs
            )
        es.add_dataframe(table)

    # Relationships are restored only after every dataframe has been added.
    for rel_description in description['relationships']:
        es.add_relationship(
            relationship=Relationship.from_dictionary(rel_description, es)
        )

    return es
def empty_dataframe(description):
    '''Deserialize empty dataframe from dataframe description.

    Args:
        description (dict) : Description of dataframe. The description is
            left unmodified.

    Returns:
        df (DataFrame) : Empty dataframe with Woodwork initialized.
    '''
    # TODO: Can we update Woodwork to return an empty initialized dataframe from a description
    # instead of using this function? Or otherwise eliminate? Issue #1476
    logical_types = {}
    semantic_tags = {}
    column_descriptions = {}
    column_metadata = {}
    use_standard_tags = {}
    category_dtypes = {}
    columns = []

    for col in description['column_typing_info']:
        col_name = col['name']
        columns.append(col_name)

        ltype_metadata = col['logical_type']
        ltype = ww_type_system.str_to_logical_type(
            ltype_metadata['type'],
            params=ltype_metadata['parameters'],
        )

        # Work on a copy: removing tags from the original container would
        # silently alter the caller's description.
        tags = col['semantic_tags'].copy()

        # The index and time index are handed to ``ww.init`` through their
        # own arguments below, so the matching tag is dropped here.
        if 'index' in tags:
            tags.remove('index')
        elif 'time_index' in tags:
            tags.remove('time_index')

        logical_types[col_name] = ltype
        semantic_tags[col_name] = tags
        column_descriptions[col_name] = col['description']
        column_metadata[col_name] = col['metadata']
        use_standard_tags[col_name] = col['use_standard_tags']

        if col['physical_type']['type'] == 'category':
            # Make sure categories are recreated properly
            cat_values = col['physical_type']['cat_values']
            cat_dtype = col['physical_type']['cat_dtype']
            cat_object = pd.CategoricalDtype(pd.Index(cat_values, dtype=cat_dtype))
            category_dtypes[col_name] = cat_object

    # Only the categorical columns receive an explicit dtype here.
    dataframe = pd.DataFrame(columns=columns).astype(category_dtypes)

    dataframe.ww.init(
        name=description.get('name'),
        index=description.get('index'),
        time_index=description.get('time_index'),
        logical_types=logical_types,
        semantic_tags=semantic_tags,
        use_standard_tags=use_standard_tags,
        table_metadata=description.get('table_metadata'),
        column_metadata=column_metadata,
        column_descriptions=column_descriptions,
        validate=False)

    return dataframe
def read_data_description(path):
    '''Read data description from a directory on disk.

    Args:
        path (str): Directory on disk that contains `data_description.json`.

    Returns:
        description (dict) : Description of :class:`.EntitySet`. The absolute
            path of the directory is stored under the `path` key.

    Raises:
        AssertionError: If `path` does not exist.
    '''
    path = os.path.abspath(path)

    # Raised explicitly rather than through an ``assert`` statement so the
    # check still runs under ``python -O``. The exception type and message
    # are unchanged for callers that already catch AssertionError.
    if not os.path.exists(path):
        raise AssertionError('"{}" does not exist'.format(path))

    filepath = os.path.join(path, 'data_description.json')
    with open(filepath, 'r') as file:
        description = json.load(file)

    # Remember where the description came from so the data files stored
    # alongside it can be located later.
    description['path'] = path
    return description
def _extract_tar_safely(tar, destination):
    '''Extract every member of ``tar`` into ``destination``, refusing members that would land outside it.'''
    if hasattr(tarfile, 'data_filter'):
        # Interpreters with extraction filters: let the standard library
        # reject unsafe members (absolute paths, "..", escaping links).
        tar.extractall(path=destination, filter='data')
        return

    # Older interpreters: check every member by hand before extracting.
    root = os.path.realpath(destination)

    def escapes(candidate):
        resolved = os.path.realpath(candidate)
        return resolved != root and not resolved.startswith(root + os.sep)

    for member in tar.getmembers():
        member_path = os.path.join(root, member.name)
        if member.issym():
            # Symlink targets are relative to the directory holding the link.
            link_target = os.path.join(os.path.dirname(member_path), member.linkname)
        elif member.islnk():
            # Hard-link targets are relative to the archive root.
            link_target = os.path.join(root, member.linkname)
        else:
            link_target = None

        if escapes(member_path) or (link_target is not None and escapes(link_target)):
            raise tarfile.TarError(
                'refusing to extract "{}" outside of the destination directory'.format(member.name)
            )

    tar.extractall(path=destination)


def read_entityset(path, profile_name=None, **kwargs):
    '''Read entityset from disk, S3 path, or URL.

    Args:
        path (str): Directory on disk, local tar archive, S3 path, or URL to
            read the serialized entityset (`data_description.json` and its
            data) from.
        profile_name (str, bool): The AWS profile specified to read from S3.
            Will default to None and search for AWS credentials.
            Set to False to use an anonymous profile.
        kwargs (keywords): Additional keyword arguments to pass as keyword
            arguments to the underlying deserialization method.

    Returns:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
    '''
    if not (_is_url(path) or _is_s3(path) or _is_local_tar(str(path))):
        # Plain directory on disk: read it in place.
        data_description = read_data_description(path)
        return description_to_entityset(data_description, **kwargs)

    with tempfile.TemporaryDirectory() as tmpdir:
        local_path = path
        transport_params = None

        if _is_s3(path):
            transport_params = get_transport_params(profile_name)

        if _is_s3(path) or _is_url(path):
            # Remote archive: download it into the temporary directory first.
            local_path = os.path.join(tmpdir, "temporary_es")
            use_smartopen_es(local_path, path, transport_params)

        with tarfile.open(str(local_path)) as tar:
            # The archive may come from an untrusted remote location, so
            # members are validated instead of being extracted blindly.
            _extract_tar_safely(tar, tmpdir)

        # The entityset must be built while the temporary directory still
        # exists, because the dataframes are read from inside it.
        data_description = read_data_description(tmpdir)
        return description_to_entityset(data_description, **kwargs)